Merge tag 'v3.8.3' into main branch libaom v3.8.3 release 2024-06-07 v3.8.3 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.8.2..v3.8.3 for all the commits in this release. - Bug Fixes * aomedia:2754, aomedia:3567: Ensure thread stack size is at least 256 KB * aomedia:3382, chromium:339877165: update codec config after svc/scale controls (CVE-2024-5493) * aomedia:3561: libaom-3.8.2 armv7 Android build failed * aomedia:3580: Allow g_timebase.num to be greater than g_timebase.den * Arm SVE build fixes. * av1_block_error_lp_neon: fix block_size param type Bug: aomedia:3581 Change-Id: I5c09e52e1e847452fd30a64504617e69924f89d4
diff --git a/.mailmap b/.mailmap index 7ddc582..e34285c 100644 --- a/.mailmap +++ b/.mailmap
@@ -40,6 +40,7 @@ Jacky Chen <jackychen@google.com> James Zern <jzern@google.com> <jzern@google.cOm> Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com> +Jian Zhou <zhoujian@fb.com> <zhoujian@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
diff --git a/AUTHORS b/AUTHORS index 509c0d1..32fe11a 100644 --- a/AUTHORS +++ b/AUTHORS
@@ -51,6 +51,7 @@ Dake He <dkhe@google.com> Damon Shen <yjshen@google.com> Dandan Ding <vickyddding@gmail.com> +Daniel Cheng <dcheng@chromium.org> Daniele Castagna <dcastagna@chromium.org> Daniel Kang <ddkang@google.com> Daniel Max Valenzuela <daniel.vt@samsung.com> @@ -94,6 +95,7 @@ Hamsalekha S <hamsalekha.s@ittiam.com> Hangyu Kuang <hkuang@google.com> Hanno Böck <hanno@hboeck.de> +Hari Limaye <hari.limaye@arm.com> Harish Mahendrakar <harish.mahendrakar@ittiam.com> Henrik Lundin <hlundin@google.com> Hien Ho <hienho@google.com> @@ -124,7 +126,7 @@ Jeff Petkau <jpet@chromium.org> Jerome Jiang <jianj@google.com> Jia Jia <jia.jia@linaro.org> -Jian Zhou <zhoujian@google.com> +Jian Zhou <zhoujian@fb.com> Jim Bankoski <jimbankoski@google.com> Jingning Han <jingning@google.com> Joe Young <joeyoung@google.com> @@ -216,6 +218,7 @@ Peter de Rivaz <peter.derivaz@gmail.com> Peter Kasting <pkasting@chromium.org> Philip Jägenstedt <philipj@opera.com> +Philippe Antoine <p.antoine@catenacyber.fr> Priit Laes <plaes@plaes.org> Qiu Jianlin <jianlin.qiu@intel.com> Rachel Barker <rachelbarker@google.com>
diff --git a/CHANGELOG b/CHANGELOG index 84bcba3..f364f0d 100644 --- a/CHANGELOG +++ b/CHANGELOG
@@ -15,6 +15,122 @@ * Arm SVE build fixes. * av1_block_error_lp_neon: fix block_size param type +2024-06-05 v3.9.1 + This release includes several bug fixes. This release is ABI + compatible with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.9.0..v3.9.1 for all the + commits in this release. + + - Bug Fixes + * aomedia:2754, aomedia:3567: Ensure thread stack size is at least + 256 KB + * b:330639949, oss-fuzz:68195: Increase scaling in linsolve_wiener + * Fix high target data rate overflow. + * aomedia:3509: Fix two UBSan errors in av1_rc_update_framerate() + * aomedia:3382, chromium:339877165: update codec config after + svc/scale controls (CVE-2024-5493) + * aomedia:3561: libaom-3.8.2 armv7 Android build failed + * aomedia:3571: {,highbd_}intrapred_neon.c: Avoid over-reads in z1 + and z3 preds + * aomedia:3578: libaom-3.9.0 undefined reference to + `aom_sub_pixel_variance16xh_ssse3' + * aomedia:3579: Use round for RC calculations in cyclic_refresh + * aomedia:3580: Allow g_timebase.num to be greater than + g_timebase.den + * oss-fuzz:68774: libaom:av1_dec_fuzzer: Segv on unknown address in + od_ec_dec_init + * Arm SVE build fixes. + * av1_block_error_lp_neon: fix block_size param type + * av1_block_error_lp_sve: fix block_size param type + +2024-04-09 v3.9.0 + This release includes new codec interfaces, compression efficiency and + perceptual improvements, speedup for RTC for both video and screen content, + and many bug fixes. This release is ABI compatible with the previous release. + + - New Features + * New codec control + * AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to + only drop spatial layers or the whole superframe. + * Active Map is fixed and tested for RTC. + * CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom + decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when + both are disabled. + * libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4. 
+ + - Compression Efficiency Improvements + * RTC encoding improvements + * 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate + gain on scrolling content. + + - Perceptual Quality Improvements + * For RTC screen content + * Reduced color artifacts for RTC screen content + * Visual quality improved for scene changes for SVC with quality layers. + * Removed visual artifacts for speed 11 + + - Speedups: + * RTC Speed 11: aggressive speedup setting added for video mode, + resolutions <= VGA: ~30% faster than speed 10. + * 5-9% speed up for high bit-depth encoding with good mode on Arm, half of + which comes from SVE/SVE2 optimizations. + + - Other improvements + * Further improvements to global motion estimation. + * Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm. + * Remove unneeded SIMD functions, saving >100 KiB from binary size. + * Cleaned up and improved pattern_search. + * Added end-to-end c vs SIMD bit-exactness test. + * Added config flag to calc psnr using libvmaf peak: use a slightly + different peak value for PSNR (1020 and 2040 for 10- and 12-bit) + + - Bug Fixes + * Fuzzing bug fixes + * b/329485898 Null-dereference WRITE in av1_cdef_frame_mt + * b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16 + * b/329813868 Ill in av1_cdef_frame_mt + * chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row + * b/330014723 Null-dereference WRITE in + cdef_copy_rect8_16bit_to_16bit_avx2 + * b/310455204 Null-dereference WRITE in prepare_enc_workers + * b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2 + * oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <= + (pbi->output_frame_width_in_tiles_minus_1 + 1) + * oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w + * oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h + * oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize + * oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in + od_ec_decode_bool_q15 + 
* oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init + * oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in + od_ec_dec_normalize + * oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in + get_ls_tile_buffers + * libaom library + * aomedia:3510 Large value of duration could cause encoder overflow + * chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx + in Win ARM64 builds + * aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after + 59c592bb8 + * aomedia:3531 Exception encountered with PSNR calculation + * aomedia:3541 Can not compile correctly by CYGWIN + * chromium:41482688 heap-buffer-overflow write in vpx_img_read() + (tools_common.c) with VPX_IMG_FMT_NV12 + * aomedia:3521 Assertion failures on Arm in CNNTest.* in + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon + * aomedia:3486 C vs NEON mismatch in AV1 encoder + * aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon() + * aomedia:3276 Significant progress on ensuring all allocations are + checked + * aomedia:3491 heap-buffer-overflow encoding frames of size 256x256, + 512x512 in good quality usage mode using 4 threads + * aomedia:3322 PSNR number discrepancy + * aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni + * aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on + aom/av1/encoder/motion_search_facade.c + * aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case + 2024-03-08 v3.8.2 This release includes several bug fixes. This release is ABI compatible with the last release. See @@ -55,6 +171,21 @@ * b/314858909: Do not use adaptive error estimate. * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. +2024-01-18 v3.7.2 + This release includes three bug fixes. This release is ABI compatible + with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.7.1..v3.7.2 for all the + commits in this release. 
+ + - Bug Fixes + * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' + failed. + * aomedia:3526: alloc_compressor_data() is called during every + aom_codec_control() call on the encoder. Note that this partially + reverts the fix for bug aomedia:3349. + * b/310457427 and b/310766628: Only use rec_sse in CBR mode. + * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. + 2023-11-30 v3.8.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4674396..328b724 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -58,9 +58,9 @@ # passed to libtool. # # We set SO_FILE_VERSION = [c-a].a.r -set(LT_CURRENT 11) -set(LT_REVISION 3) -set(LT_AGE 8) +set(LT_CURRENT 12) +set(LT_REVISION 1) +set(LT_AGE 9) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") unset(LT_CURRENT) @@ -323,11 +323,28 @@ endif() endif() -if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS) +if(CONFIG_AV1_ENCODER) list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h" "${AOM_ROOT}/av1/ratectrl_rtc.cc") add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES}) - target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) + # aom_av1_rc calls libaom's internal functions, so it must be linked with the + # libaom static library. + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static) + else() + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) + endif() + if(BUILD_SHARED_LIBS) + # On Windows, global symbols are not exported from a DLL by default. Enable + # the WINDOWS_EXPORT_ALL_SYMBOLS property to export all global symbols from + # the aom_av1_rc DLL on Windows, to match the default behavior on other + # platforms. + set_target_properties(aom_av1_rc PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + # The aom_av1_rc library and its header "av1/ratectrl_rtc.h" are not + # installed by the "install" command, so we don't need to worry about + # versioning the aom_av1_rc shared library. If we start to install the + # aom_av1_rc library, the library should be versioned. + endif() if(NOT WIN32 AND NOT APPLE) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m) endif() @@ -336,7 +353,7 @@ # List of object and static library targets. 
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) -if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS) +if(CONFIG_AV1_ENCODER) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc) endif() if(BUILD_SHARED_LIBS) @@ -374,6 +391,7 @@ # if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) + add_library(aom_usage_exit OBJECT "${AOM_GEN_SRC_DIR}/usage_exit.c") set_property(TARGET ${example} PROPERTY FOLDER examples) if(CONFIG_AV1_DECODER) add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) @@ -486,19 +504,15 @@ add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" $<TARGET_OBJECTS:aom_common_app_util> $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) # Maintain a list of encoder example targets. list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model photon_noise_table set_maps simple_encoder scalable_encoder - twopass_encoder) - - if(NOT BUILD_SHARED_LIBS) - add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" - $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_encoder_app_util>) - target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) - list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc) - endif() + svc_encoder_rtc twopass_encoder) endif() if(ENABLE_TOOLS) @@ -508,10 +522,10 @@ # aom_entropy_optimizer.c won't work on macos, but dragging in all the # helper machinery allows the link to succeed. 
add_executable(aom_entropy_optimizer - "${AOM_GEN_SRC_DIR}/usage_exit.c" "${AOM_ROOT}/tools/aom_entropy_optimizer.c" $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_encoder_app_util>) + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) # Maintain a list of encoder tool targets. list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) @@ -661,12 +675,12 @@ if(ENABLE_TOOLS) if(CONFIG_AV1_DECODER) - add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.c" - "${AOM_ROOT}/tools/dump_obu.cc" + add_executable(dump_obu "${AOM_ROOT}/tools/dump_obu.cc" "${AOM_ROOT}/tools/obu_parser.cc" "${AOM_ROOT}/tools/obu_parser.h" $<TARGET_OBJECTS:aom_common_app_util> - $<TARGET_OBJECTS:aom_decoder_app_util>) + $<TARGET_OBJECTS:aom_decoder_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) list(APPEND AOM_TOOL_TARGETS dump_obu) list(APPEND AOM_APP_TARGETS dump_obu) @@ -825,7 +839,8 @@ # Clang's AddressSanitizer documentation says "When linking shared libraries, # the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link # errors (don't use it with AddressSanitizer)." See - # https://clang.llvm.org/docs/AddressSanitizer.html#usage. + # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see + # https://clang.llvm.org/docs/MemorySanitizer.html#usage. if(NOT WIN32 AND NOT APPLE AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) @@ -940,7 +955,7 @@ foreach(var ${all_cmake_vars}) if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST" - AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_") + AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER") list(APPEND aom_source_vars ${var}) endif() endforeach()
diff --git a/README.md b/README.md index 4e2eb27..f81e13e 100644 --- a/README.md +++ b/README.md
@@ -46,17 +46,23 @@ ### Prerequisites {#prerequisites} - 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version - required. - 2. [Git](https://git-scm.com/). - 3. [Perl](https://www.perl.org/). - 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a - recent version of [nasm](http://www.nasm.us/). If you download yasm with - the intention to work with Visual Studio, please download win32.exe or - win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe. - 5. Building the documentation requires +1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version + required. +2. [Git](https://git-scm.com/). +3. A modern C compiler. gcc 6+, clang 7+, Microsoft Visual Studio 2019+ or + the latest version of MinGW-w64 (clang64 or ucrt toolchains) are + recommended. A C++ compiler is necessary to build the unit tests and some + features contained in the examples. +4. [Perl](https://www.perl.org/). +5. For x86 targets, [yasm](http://yasm.tortall.net/) or a recent version (2.14 + or later) of [nasm](http://www.nasm.us/). (If both yasm and nasm are + present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to + select nasm.) If you download yasm with the intention to work with Visual + Studio, please download win32.exe or win64.exe and rename it into yasm.exe. + DO NOT download or use vsyasm.exe. +6. Building the documentation requires [doxygen version 1.8.10 or newer](http://doxygen.org). - 6. Emscripten builds require the portable +7. Emscripten builds require the portable [EMSDK](https://kripken.github.io/emscripten-site/index.html). ### Get the code {#get-the-code}
diff --git a/aom/aom_decoder.h b/aom/aom_decoder.h index f3f11d8..229cf73 100644 --- a/aom/aom_decoder.h +++ b/aom/aom_decoder.h
@@ -30,7 +30,7 @@ extern "C" { #endif -#include "aom/aom_codec.h" +#include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_frame_buffer.h" /*!\brief Current ABI version number
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h index 5d0bbe1..15cf21b 100644 --- a/aom/aom_encoder.h +++ b/aom/aom_encoder.h
@@ -30,7 +30,7 @@ extern "C" { #endif -#include "aom/aom_codec.h" +#include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_external_partition.h" /*!\brief Current ABI version number @@ -637,6 +637,7 @@ /*!\brief Target data rate * * Target bitrate to use for this stream, in kilobits per second. + * Max allowed value is 2000000 */ unsigned int rc_target_bitrate; @@ -1044,6 +1045,11 @@ * Interface is not an encoder interface. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. + * + * \note + * `duration` is of the unsigned long type, which can be 32 or 64 bits. + * `duration` must be less than or equal to UINT32_MAX so that its range is + * independent of the size of unsigned long. */ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration,
diff --git a/aom/aom_image.h b/aom/aom_image.h index d5f0c08..68fb312 100644 --- a/aom/aom_image.h +++ b/aom/aom_image.h
@@ -103,7 +103,8 @@ AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ -} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ +} aom_transfer_characteristics_t; /**< alias for enum + aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { @@ -125,7 +126,7 @@ AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ -} aom_matrix_coefficients_t; +} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { @@ -144,7 +145,8 @@ /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ -} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ +} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position + */ /*!\brief List of insert flags for Metadata * @@ -244,10 +246,13 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -267,10 +272,12 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. 
* \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of each row in the image - * (stride). + * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img @@ -291,12 +298,17 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. + * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. + * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be
diff --git a/aom/aom_integer.h b/aom/aom_integer.h index d9bba09..ce65e98 100644 --- a/aom/aom_integer.h +++ b/aom/aom_integer.h
@@ -12,7 +12,7 @@ #define AOM_AOM_AOM_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ -#include <stddef.h> +#include <stddef.h> // IWYU pragma: export #if defined(_MSC_VER) #define AOM_FORCE_INLINE __forceinline @@ -33,8 +33,8 @@ #endif #endif // __cplusplus -#include <stdint.h> -#include <inttypes.h> +#include <stdint.h> // IWYU pragma: export +#include <inttypes.h> // IWYU pragma: export #if defined(__cplusplus) extern "C" {
diff --git a/aom/aomcx.h b/aom/aomcx.h index f061be3..edd8cd5 100644 --- a/aom/aomcx.h +++ b/aom/aomcx.h
@@ -1533,6 +1533,12 @@ */ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, + /*!\brief Codec control to set the frame drop mode for SVC, + * unsigned int parameter. The valid values are constants of the + * AOM_SVC_FRAME_DROP_MODE enum: AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP. + */ + AV1E_SET_SVC_FRAME_DROP_MODE = 165, + // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -1699,6 +1705,12 @@ int use_comp_pred[3]; /**<Compound reference flag. */ } aom_svc_ref_frame_comp_pred_t; +/*!\brief Frame drop modes for spatial/quality layer SVC */ +typedef enum { + AOM_LAYER_DROP, /**< Any spatial layer can drop. */ + AOM_FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ +} AOM_SVC_FRAME_DROP_MODE; + /*!\cond */ /*!\brief Encoder control function parameter type * @@ -2178,6 +2190,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int) #define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_FRAME_DROP_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_SVC_FRAME_DROP_MODE + AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int) #define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
diff --git a/aom/aomdx.h b/aom/aomdx.h index 02ea195..2dd7bb3 100644 --- a/aom/aomdx.h +++ b/aom/aomdx.h
@@ -234,8 +234,11 @@ */ AV1D_GET_IMG_FORMAT, - /*!\brief Codec control function to get the size of the tile, unsigned int* - * parameter + /*!\brief Codec control function to get the width and height (in pixels) of + * the tiles in a tile list, unsigned int* parameter + * + * Tile width is in the high 16 bits of the output value, and tile height is + * in the low 16 bits of the output value. */ AV1D_GET_TILE_SIZE,
diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h index fc2975d..b854a88 100644 --- a/aom/internal/aom_codec_internal.h +++ b/aom/internal/aom_codec_internal.h
@@ -395,10 +395,21 @@ #endif #endif +// Records the error code and error message. Does not call longjmp(). +void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, + const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4); + void aom_internal_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; +// Calls aom_internal_error() with the error code and error message in `src`. +// `info` and `src` must not point to the same struct, i.e., self copy is +// prohibited. +void aom_internal_error_copy(struct aom_internal_error_info *info, + const struct aom_internal_error_info *src) + CLANG_ANALYZER_NORETURN; + void aom_merge_corrupted_flag(int *corrupted, int value); #ifdef __cplusplus } // extern "C"
diff --git a/aom/src/aom_codec.c b/aom/src/aom_codec.c index 4e75fcb..316cc6f 100644 --- a/aom/src/aom_codec.c +++ b/aom/src/aom_codec.c
@@ -13,6 +13,7 @@ * \brief Provides the high level interface to wrap decoder algorithms. * */ +#include <assert.h> #include <stdarg.h> #include <stdlib.h> @@ -129,10 +130,9 @@ return ctx->err; } -void aom_internal_error(struct aom_internal_error_info *info, - aom_codec_err_t error, const char *fmt, ...) { - va_list ap; - +LIBAOM_FORMAT_PRINTF(3, 0) +static void set_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, va_list ap) { info->error_code = error; info->has_detail = 0; @@ -140,15 +140,45 @@ size_t sz = sizeof(info->detail); info->has_detail = 1; - va_start(ap, fmt); vsnprintf(info->detail, sz - 1, fmt, ap); - va_end(ap); info->detail[sz - 1] = '\0'; } +} + +void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, + const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); + + assert(!info->setjmp); +} + +void aom_internal_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); if (info->setjmp) longjmp(info->jmp, info->error_code); } +void aom_internal_error_copy(struct aom_internal_error_info *info, + const struct aom_internal_error_info *src) { + assert(info != src); + assert(!src->setjmp); + + if (!src->has_detail) { + aom_internal_error(info, src->error_code, NULL); + } else { + aom_internal_error(info, src->error_code, "%s", src->detail); + } +} + void aom_merge_corrupted_flag(int *corrupted, int value) { *corrupted |= value; }
diff --git a/aom/src/aom_encoder.c b/aom/src/aom_encoder.c index 70e0b75..f188567 100644 --- a/aom/src/aom_encoder.c +++ b/aom/src/aom_encoder.c
@@ -23,6 +23,7 @@ #endif #include <limits.h> +#include <stdint.h> #include <string.h> #include "aom/aom_encoder.h" @@ -178,6 +179,10 @@ else if (img && ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) != ((ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) != 0)) { res = AOM_CODEC_INVALID_PARAM; +#if ULONG_MAX > UINT32_MAX + } else if (duration > UINT32_MAX) { + res = AOM_CODEC_INVALID_PARAM; +#endif } else { /* Execute in a normalized floating point environment, if the platform * requires it.
diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c index 8e94d5d..c29095c 100644 --- a/aom/src/aom_image.c +++ b/aom/src/aom_image.c
@@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <limits.h> #include <stdlib.h> #include <string.h> @@ -36,11 +37,20 @@ /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */ - unsigned int h, w, s, xcs, ycs, bps, bit_depth; - unsigned int stride_in_bytes; + unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); + if (fmt == AOM_IMG_FMT_NONE) goto fail; + + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536 || size_align > 65536 || border > 65536) { + goto fail; + } + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -103,12 +113,17 @@ } /* Calculate storage sizes given the chroma subsampling */ - w = align_image_dimension(d_w, xcs, size_align); - h = align_image_dimension(d_h, ycs, size_align); + const unsigned int w = align_image_dimension(d_w, xcs, size_align); + assert(d_w <= w); + const unsigned int h = align_image_dimension(d_h, ycs, size_align); + assert(d_h <= h); - s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = s * bit_depth / 8; + uint64_t s = (uint64_t)w + 2 * border; + s = (fmt & AOM_IMG_FMT_PLANAR) ? s : s * bps / bit_depth; + s = s * bit_depth / 8; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + if (s > INT_MAX) goto fail; + const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { @@ -167,7 +182,9 @@ /* Default viewport to entire image. (This aom_img_set_rect call always * succeeds.) 
*/ - aom_img_set_rect(img, 0, 0, d_w, d_h, border); + int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border); + assert(ret == 0); + (void)ret; return img; fail: @@ -230,7 +247,7 @@ img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; - data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; @@ -242,14 +259,14 @@ } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; @@ -289,15 +306,15 @@ } int aom_img_plane_width(const aom_image_t *img, int plane) { - if (plane > 0 && img->x_chroma_shift > 0) - return (img->d_w + 1) >> img->x_chroma_shift; + if (plane > 0) + return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) - return (img->d_h + 1) >> img->y_chroma_shift; + if (plane > 0) + return (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; }
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake index f8f2cbb..750df42 100644 --- a/aom_dsp/aom_dsp.cmake +++ b/aom_dsp/aom_dsp.cmake
@@ -52,16 +52,12 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" @@ -145,6 +141,9 @@ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_sve.c") endif() if(CONFIG_AV1_DECODER) @@ -200,15 +199,18 @@ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c") + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_sve.c") endif() list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 @@ -227,6 +229,9 @@ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + 
"${AOM_ROOT}/aom_dsp/x86/subpel_variance_ssse3.asm") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -262,6 +267,7 @@ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 @@ -292,6 +298,10 @@ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c") + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c" + "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c" + "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c") + if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" @@ -327,6 +337,10 @@ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c") endif() if(CONFIG_INTERNAL_STATS) @@ -484,6 +498,15 @@ "AOM_DSP_COMMON_INTRIN_NEON_I8MM") endif() + if(HAVE_SVE) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SVE") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SVE") + endif() + endif() + target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_rtcd.c b/aom_dsp/aom_dsp_rtcd.c index 1514bd6..0265dd1 100644 --- a/aom_dsp/aom_dsp_rtcd.c +++ b/aom_dsp/aom_dsp_rtcd.c
@@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); } +void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); }
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index c9b2682..b75bdc5 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -498,8 +498,8 @@ add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy neon sse2 avx2/; -specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; -specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/aom_scaled_2d ssse3 neon/; @@ -509,10 +509,10 @@ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/; add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; - specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon sve/; add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; - specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon sve/; } # @@ -776,30 +776,30 @@ specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/; add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum"; - specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/; + specialize qw/aom_get_blk_sse_sum sse2 avx2 neon sve/; if 
(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/aom_highbd_subtract_block sse2 neon/; add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; - specialize qw/aom_highbd_sse sse4_1 avx2 neon/; + specialize qw/aom_highbd_sse sse4_1 avx2 neon sve/; } # # Sum of Squares # add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; - specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon sve/; add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; - specialize qw/aom_sum_squares_i16 sse2 neon/; + specialize qw/aom_sum_squares_i16 sse2 neon sve/; add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/; add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; - specialize qw/aom_var_2d_u16 sse2 avx2 neon/; + specialize qw/aom_var_2d_u16 sse2 avx2 neon sve/; # # Single block SAD / Single block Avg SAD @@ -813,7 +813,7 @@ } add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum"; - specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/; + specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2 sve/; specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/; @@ -1087,7 +1087,7 @@ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/; - 
specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/; + specialize qw/aom_sad_skip_16x4x4d avx2 neon neon_dotprod/; specialize qw/aom_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_sad_skip_8x8x4d sse2 neon/; @@ -1116,7 +1116,7 @@ specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/; - specialize qw/aom_sad16x4x3d neon neon_dotprod/; + specialize qw/aom_sad16x4x3d avx2 neon neon_dotprod/; specialize qw/aom_sad8x32x3d neon/; specialize qw/aom_sad4x16x3d neon/; @@ -1263,9 +1263,7 @@ specialize qw/aom_int_pro_col avx2 sse2 neon/; add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; - specialize qw/aom_vector_var avx2 sse4_1 neon/; - # TODO(kyslov@) bring back SSE2 by extending it to 128 block size - #specialize qw/aom_vector_var neon sse2/; + specialize qw/aom_vector_var avx2 sse4_1 neon sve/; # # hamadard transform and satd for implmenting temporal dependency model @@ -1352,16 +1350,24 @@ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/; - specialize "aom_highbd_${bd}_mse16x8", qw/neon/; - specialize "aom_highbd_${bd}_mse8x16", qw/neon/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/; - } + if ($bd eq 8) { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; + } elsif ($bd eq 10) { + specialize "aom_highbd_${bd}_mse16x16", qw/avx2 sse2 neon 
sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; + } else { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; + } - specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/; + } } # @@ -1403,39 +1409,39 @@ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/; specialize qw/aom_variance4x4 sse2 neon neon_dotprod/; - specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 
neon ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x4 neon ssse3/; - specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 
neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 neon ssse3/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { specialize qw/aom_variance4x16 neon neon_dotprod sse2/; @@ -1445,18 +1451,18 @@ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/; - specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x16 neon ssse3/; + specialize 
qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/; @@ -1495,66 +1501,66 @@ } } - specialize qw/aom_highbd_12_variance128x128 sse2 neon/; - specialize qw/aom_highbd_12_variance128x64 sse2 neon/; - specialize qw/aom_highbd_12_variance64x128 sse2 neon/; - specialize qw/aom_highbd_12_variance64x64 sse2 neon/; - specialize qw/aom_highbd_12_variance64x32 sse2 neon/; - specialize qw/aom_highbd_12_variance32x64 sse2 neon/; - specialize qw/aom_highbd_12_variance32x32 sse2 neon/; - specialize qw/aom_highbd_12_variance32x16 sse2 neon/; - specialize qw/aom_highbd_12_variance16x32 sse2 neon/; - specialize qw/aom_highbd_12_variance16x16 sse2 neon/; - specialize qw/aom_highbd_12_variance16x8 sse2 neon/; - specialize qw/aom_highbd_12_variance8x16 sse2 neon/; - specialize qw/aom_highbd_12_variance8x8 sse2 neon/; - specialize qw/aom_highbd_12_variance8x4 neon/; - specialize qw/aom_highbd_12_variance4x8 neon/; - specialize qw/aom_highbd_12_variance4x4 sse4_1 neon/; + specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/; + 
specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x4 neon sve/; + specialize qw/aom_highbd_12_variance4x8 neon sve/; + specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/; - specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x4 neon/; - specialize qw/aom_highbd_10_variance4x8 neon/; - specialize qw/aom_highbd_10_variance4x4 sse4_1 neon/; + specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/; + specialize 
qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x4 neon sve/; + specialize qw/aom_highbd_10_variance4x8 neon sve/; + specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/; - specialize qw/aom_highbd_8_variance128x128 sse2 neon/; - specialize qw/aom_highbd_8_variance128x64 sse2 neon/; - specialize qw/aom_highbd_8_variance64x128 sse2 neon/; - specialize qw/aom_highbd_8_variance64x64 sse2 neon/; - specialize qw/aom_highbd_8_variance64x32 sse2 neon/; - specialize qw/aom_highbd_8_variance32x64 sse2 neon/; - specialize qw/aom_highbd_8_variance32x32 sse2 neon/; - specialize qw/aom_highbd_8_variance32x16 sse2 neon/; - specialize qw/aom_highbd_8_variance16x32 sse2 neon/; - specialize qw/aom_highbd_8_variance16x16 sse2 neon/; - specialize qw/aom_highbd_8_variance16x8 sse2 neon/; - specialize qw/aom_highbd_8_variance8x16 sse2 neon/; - specialize qw/aom_highbd_8_variance8x8 sse2 neon/; - specialize qw/aom_highbd_8_variance8x4 neon/; - specialize qw/aom_highbd_8_variance4x8 neon/; - specialize qw/aom_highbd_8_variance4x4 sse4_1 neon/; + specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x32 sse2 neon 
sve/; + specialize qw/aom_highbd_8_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x4 neon sve/; + specialize qw/aom_highbd_8_variance4x8 neon sve/; + specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach $bd (8, 10, 12) { my $avx2 = ($bd == 10) ? "avx2" : ""; - specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance16x4" , qw/neon/; - specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance4x16" , qw/neon/; + specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/; + specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/; } } @@ -1773,7 +1779,7 @@ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/; add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; - specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon/; + specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/; } add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; @@ -1786,11 +1792,14 @@ # Flow estimation library if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { - add_proto qw/double 
av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2"; - specialize qw/av1_compute_cross_correlation sse4_1 avx2/; + add_proto qw/bool aom_compute_mean_stddev/, "const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev"; + specialize qw/aom_compute_mean_stddev sse4_1 avx2/; + + add_proto qw/double aom_compute_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2"; + specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon sve/; } } # CONFIG_AV1_ENCODER
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h index ab950ca..69da8f2 100644 --- a/aom_dsp/aom_simd.h +++ b/aom_dsp/aom_simd.h
@@ -24,12 +24,10 @@ #define SIMD_CHECK 1 // Sanity checks in C equivalents -#if HAVE_NEON -#include "simd/v256_intrinsics_arm.h" // VS compiling for 32 bit targets does not support vector types in // structs as arguments, which makes the v256 type of the intrinsics // hard to support, so optimizations for this target are disabled. -#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) +#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) #include "simd/v256_intrinsics_x86.h" #else #include "simd/v256_intrinsics.h"
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c index c8ee780..193844d 100644 --- a/aom_dsp/arm/aom_convolve8_neon.c +++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -20,6 +20,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -31,14 +33,14 @@ const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x4_t sum; - sum = vmul_lane_s16(s0, filter_lo, 0); + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return sum; @@ -51,72 +53,61 @@ const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x8_t sum; - sum = vmulq_lane_s16(s0, filter_lo, 0); + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { const int16x8_t filter = vld1q_s16(filter_x); - assert((intptr_t)dst % 4 
== 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - if (h == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); src += 7; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + 
int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_elems_inplace_u8_4x4(&d01, &d23); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d23, 0); - store_u8_4x1(dst + 2 * dst_stride, d01, 1); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -125,98 +116,96 @@ s4 = s8; s5 = s9; s6 = s10; + src += 4; dst += 4; w -= 4; } while (w != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = 
vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); - store_u8_4x1(dst + 2 * dst_stride, d2, 0); - store_u8_4x1(dst + 3 * dst_stride, d3, 0); - store_u8_4x1(dst + 4 * dst_stride, d0, 1); - store_u8_4x1(dst + 5 * dst_stride, d1, 1); - store_u8_4x1(dst + 6 * dst_stride, d2, 1); - store_u8_4x1(dst + 7 * dst_stride, d3, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0); + store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1); + store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2); + store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3); src += 8 * src_stride; dst += 8 * dst_stride; h -= 
8; } while (h > 0); } else { - uint8x8_t d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - int width; - const uint8_t *s; - uint8_t *d; - do { - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; - width = w; - s = src + 7; - d = dst; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, 
s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -230,6 +219,7 @@ s4 = s12; s5 = s13; s6 = s14; + s += 8; d += 8; width -= 8; @@ -242,55 +232,141 @@ } } -void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const int16x8_t filter = vld1q_s16(filter_y); +static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + if (w == 4) { + do { + uint8x8_t t01[4]; + + t01[0] = load_unaligned_u8(src + 0, (int)src_stride); + t01[1] = load_unaligned_u8(src + 1, (int)src_stride); + t01[2] = load_unaligned_u8(src + 2, (int)src_stride); + t01[3] = load_unaligned_u8(src + 3, (int)src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); + + int16x8_t s0[4], s1[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + store_u8_8x2(d, dst_stride, d0, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + 
int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); - (void)filter_x; (void)x_step_q4; + (void)filter_y; (void)y_step_q4; - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + src -= ((SUBPEL_TAPS / 2) - 1); + + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { + convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, + h); + } else { + convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h); + } +} + +static INLINE void convolve8_vert_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { + const int16x8_t filter = vld1q_s16(filter_y); if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); - s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); - s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); src += 7 * src_stride; do { 
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -299,42 +375,40 @@ s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int height; - const uint8_t *s; - uint8_t *d; - do { + uint8x8_t t0, t1, t2, t3, t4, 
t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - height = h; - s = src + 7 * src_stride; - d = dst; + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int height = h; + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -345,6 +419,7 @@ s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * 
dst_stride; height -= 4; @@ -355,3 +430,30 @@ } while (w != 0); } } + +void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h); + } +}
diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h new file mode 100644 index 0000000..b523c41 --- /dev/null +++ b/aom_dsp/arm/aom_convolve8_neon.h
@@ -0,0 +1,285 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ +#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // Bilinear filter values are all positive. + const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]); + const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]); + + if (w == 4) { + do { + uint8x8_t s0 = + load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride); + uint8x8_t s1 = + load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride); + uint8x8_t s2 = + load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride); + uint8x8_t s3 = + load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 8) { + do { + uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0); + uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1); + uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0); + uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1); + 
+ uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1_u8(dst + 0 * dst_stride, d0); + vst1_u8(dst + 1 * dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0 = vld1q_u8(s + 0); + uint8x16_t s1 = vld1q_u8(s + 1); + + uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); + sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); + uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); + sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h > 0); + } +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_vert_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); + + if (w == 4) { + uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + + int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + + src += 2 * src_stride; + + do { + uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); + uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); + + int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); + int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); + int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); + int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); + + uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter); + uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + uint8x8_t t0, t1, t2; + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + int height = h; + const uint8_t *s = src + 3 * src_stride; + uint8_t *d = dst; + + do { + uint8x8_t t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter); + uint8x8_t d1 = convolve4_8(s1, s2, 
s3, s4, filter); + uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter); + uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve8_vert_2tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_y, int w, + int h) { + // Bilinear filter values are all positive. + uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]); + uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]); + + if (w == 4) { + do { + uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); + uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); + uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); + uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s2, f0); + sum1 = vmlal_u8(sum1, s3, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 8) { + do { + uint8x8_t s0, s1, s2; + load_u8_8x3(src, src_stride, &s0, &s1, &s2); + + uint16x8_t sum0 = vmull_u8(s0, f0); + sum0 = vmlal_u8(sum0, s1, f1); + uint16x8_t sum1 = vmull_u8(s1, f0); + sum1 = vmlal_u8(sum1, s2, f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1_u8(dst + 0 * dst_stride, d0); + vst1_u8(dst + 1 * dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = 
w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0 = vld1q_u8(s + 0 * src_stride); + uint8x16_t s1 = vld1q_u8(s + 1 * src_stride); + + uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); + sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); + uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); + sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); + + uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); + uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h > 0); + } +} + +#endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c index e565414..7219570 100644 --- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -20,150 +20,121 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +// Filter values always sum to 128. +#define FILTER_WEIGHT 128 + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. 
return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. 
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_horiz_8tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, 
correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -178,175 
+149,287 @@ } } -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + // (Divide by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + // (Divide by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int w = width; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { + convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, + filter_x, w, h); + } else { + convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride, + filter_x, w, h); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as 
follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, samples_lo, filter, 0); - sum = vdotq_lane_s32(sum, samples_hi, filter, 1); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. 
*/ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_vert_8tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)filter_x; - (void)x_step_q4; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + int8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -357,67 +440,47 @@ h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); @@ -430,19 +493,19 @@ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filter); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filter); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filter); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -462,3 +525,31 @@ } while (w != 0); } } + +void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y, + w, h); + } +}
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c index d778e8a..34bfe01 100644 --- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -15,141 +15,113 @@ #include <string.h> #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_convolve8_neon.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, - const int8x8_t filter, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, - const int8x8_t filter, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - /* Permute samples ready for dot product. 
*/ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +static INLINE void convolve8_horiz_8tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, 
dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -164,153 +136,256 @@ } } +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + // Second 4 output values. 
+ int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_h(s0, filter, perm_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, perm_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, perm_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, perm_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int w = width; + const uint8_t *s = src; + uint8_t *d = dst; + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, + h); + } else if (filter_taps == 4) { + convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, + filter_x, w, h); + } else { + convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x, + w, h); + } +} + static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result 
rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. 
Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum; +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filter, 1); - - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. 
return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1); + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_vert_8tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)filter_x; - (void)x_step_q4; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + uint8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -321,52 +396,33 @@ h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); @@ -379,19 +435,19 @@ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -411,3 +467,31 @@ } while (w != 0); } } + +void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else if (filter_taps == 4) { + convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, + filter_y, w, h); + } else { + convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w, + h); + } +}
diff --git a/aom_dsp/arm/aom_filter.h b/aom_dsp/arm/aom_filter.h new file mode 100644 index 0000000..9972d06 --- /dev/null +++ b/aom_dsp/arm/aom_filter.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_AOM_FILTER_H_
+#define AOM_AOM_DSP_ARM_AOM_FILTER_H_
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+// Returns 8, 6, 4 or 2: the span of the outermost non-zero pair in filter[0..7].
+static INLINE int get_filter_taps_convolve8(const int16_t *filter) {
+  if (filter[0] | filter[7]) {  // Either outermost coefficient is non-zero.
+    return 8;
+  }
+  if (filter[1] | filter[6]) {
+    return 6;
+  }
+  if (filter[2] | filter[5]) {
+    return 4;
+  }
+  return 2;  // filter[0..2] and filter[5..7] are all zero at this point.
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_FILTER_H_
diff --git a/aom_dsp/arm/aom_neon_sve2_bridge.h b/aom_dsp/arm/aom_neon_sve2_bridge.h new file mode 100644 index 0000000..6e7d2d6 --- /dev/null +++ b/aom_dsp/arm/aom_neon_sve2_bridge.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
+#define AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// We can access instructions exclusive to the SVE2 instruction set from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+// aom_tbl2_s16: svtbl2_s16 lookup into the pair (s0, s1) using the indices in tbl.
+static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1,
+                                     uint16x8_t tbl) {
+  svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+                                      svset_neonq_s16(svundef_s16(), s1));
+  return svget_neonq_s16(  // Hand back only the Neon-sized part of the result.
+      svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_
diff --git a/aom_dsp/arm/aom_neon_sve_bridge.h b/aom_dsp/arm/aom_neon_sve_bridge.h new file mode 100644 index 0000000..3da80e2 --- /dev/null +++ b/aom_dsp/arm/aom_neon_sve_bridge.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
+#define AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// We can access instructions exclusive to the SVE instruction set from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+// aom_udotq_u16: svdot_u64 of the u16 lanes of x and y, accumulated onto acc.
+static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x,
+                                       uint16x8_t y) {
+  return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+                                   svset_neonq_u16(svundef_u16(), x),
+                                   svset_neonq_u16(svundef_u16(), y)));
+}
+// aom_sdotq_s16: svdot_s64 of the s16 lanes of x and y, accumulated onto acc.
+static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+                                   svset_neonq_s16(svundef_s16(), x),
+                                   svset_neonq_s16(svundef_s16(), y)));
+}
+// svdot_lane_s64 wrapper; presumably a macro so that lane stays a constant.
+#define aom_svdot_lane_s16(sum, s0, f, lane)                          \
+  svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \
+                                 svset_neonq_s16(svundef_s16(), s0),  \
+                                 svset_neonq_s16(svundef_s16(), f), lane))
+// aom_tbl_u16: svtbl_u16 lookup into the u16 lanes of s using the indices in tbl.
+static INLINE uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) {
+  return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s),
+                                   svset_neonq_u16(svundef_u16(), tbl)));
+}
+// aom_tbl_s16: svtbl_s16 lookup into the s16 lanes of s; tbl indices are unsigned.
+static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) {
+  return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s),
+                                   svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_
diff --git a/aom_dsp/arm/avg_sve.c b/aom_dsp/arm/avg_sve.c new file mode 100644 index 0000000..57a5465 --- /dev/null +++ b/aom_dsp/arm/avg_sve.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+// Returns sum(d * d) - ((sum(d))^2 >> (bwl + 2)) for d = ref - src over 4 << bwl samples.
+int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) {
+  assert(bwl >= 2 && bwl <= 5);
+  int width = 4 << bwl;  // 16, 32, 64 or 128 given the assert above.
+
+  int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+
+  do {  // Each iteration consumes 16 samples as two independent 8-lane halves.
+    int16x8_t r0 = vld1q_s16(ref);
+    int16x8_t s0 = vld1q_s16(src);
+
+    // diff: dynamic range [-510, 510] 10 (signed) bits.
+    int16x8_t diff0 = vsubq_s16(r0, s0);
+    // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+    v_mean[0] = vaddq_s16(v_mean[0], diff0);
+
+    // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+    sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0);
+
+    int16x8_t r1 = vld1q_s16(ref + 8);
+    int16x8_t s1 = vld1q_s16(src + 8);
+
+    // diff: dynamic range [-510, 510] 10 (signed) bits.
+    int16x8_t diff1 = vsubq_s16(r1, s1);
+    // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+    v_mean[1] = vaddq_s16(v_mean[1], diff1);
+
+    // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+    sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1);
+
+    ref += 16;
+    src += 16;
+    width -= 16;
+  } while (width != 0);
+
+  // Dynamic range [0, 65280], 16 (unsigned) bits.
+  const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1])));
+  const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+
+  // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+  return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2)));  // 2^(bwl+2) == 4 << bwl.
+}
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c index 7b1b66a..48ff683 100644 --- a/aom_dsp/arm/blend_a64_mask_neon.c +++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -20,8 +20,9 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" -uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b, - uint16x8_t round_offset) { +static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, + uint16x8_t b, + uint16x8_t round_offset) { const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a)); @@ -91,7 +92,7 @@ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -139,7 +140,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -181,7 +182,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -225,7 +226,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -293,7 +294,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -358,7 +359,7 @@ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, 
dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -418,7 +419,7 @@ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -479,7 +480,7 @@ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/blk_sse_sum_sve.c b/aom_dsp/arm/blk_sse_sum_sve.c new file mode 100644 index 0000000..f538346 --- /dev/null +++ b/aom_dsp/arm/blk_sse_sum_sve.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+// 4-wide case: packs two rows into one vector; writes sum(x) and sum(x * x).
+static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride,
+                                           int bh, int *x_sum,
+                                           int64_t *x2_sum) {
+  int32x4_t sum = vdupq_n_s32(0);
+  int64x2_t sse = vdupq_n_s64(0);
+
+  do {
+    int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+    sum = vpadalq_s16(sum, d);
+
+    sse = aom_sdotq_s16(sse, d, d);
+
+    data += 2 * stride;
+    bh -= 2;
+  } while (bh != 0);  // NOTE(review): assumes bh is even and non-zero - confirm.
+
+  *x_sum = vaddvq_s32(sum);
+  *x2_sum = vaddvq_s64(sse);
+}
+// 8-wide case: two rows per iteration, each row with its own accumulator pair.
+static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride,
+                                           int bh, int *x_sum,
+                                           int64_t *x2_sum) {
+  int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int16x8_t d0 = vld1q_s16(data);
+    int16x8_t d1 = vld1q_s16(data + stride);
+
+    sum[0] = vpadalq_s16(sum[0], d0);
+    sum[1] = vpadalq_s16(sum[1], d1);
+
+    sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+    sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+    data += 2 * stride;
+    bh -= 2;
+  } while (bh != 0);  // NOTE(review): assumes bh is even and non-zero - confirm.
+
+  *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+  *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+// Wide case: walks every row 16 samples at a time (bw must be a multiple of 16).
+static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride,
+                                             int bw, int bh, int *x_sum,
+                                             int64_t *x2_sum) {
+  int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int j = bw;
+    const int16_t *data_ptr = data;
+    do {
+      int16x8_t d0 = vld1q_s16(data_ptr);
+      int16x8_t d1 = vld1q_s16(data_ptr + 8);
+
+      sum[0] = vpadalq_s16(sum[0], d0);
+      sum[1] = vpadalq_s16(sum[1], d1);
+
+      sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+      sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+      data_ptr += 16;
+      j -= 16;
+    } while (j != 0);
+
+    data += stride;
+  } while (--bh != 0);
+
+  *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+  *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+// Writes the sum of the bw x bh block to *x_sum and its sum of squares to *x2_sum.
+void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh,
+                             int *x_sum, int64_t *x2_sum) {
+  if (bw == 4) {
+    get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum);
+  } else if (bw == 8) {
+    get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum);
+  } else {
+    assert(bw % 16 == 0);  // The wide helper steps through rows in chunks of 16.
+    get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c index fb4cda7..c87acfb 100644 --- a/aom_dsp/arm/fwd_txfm_neon.c +++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -12,6 +12,7 @@ #include <arm_neon.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/arm/mem_neon.h" @@ -115,6 +116,7 @@ vst1q_s16(final_output + 1 * 8, out_23); } +#if CONFIG_INTERNAL_STATS void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // stage 1 int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); @@ -302,20 +304,4 @@ vst1q_s16(&final_output[7 * 8], input_7); } } - -void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); - output[1] = 0; - } -} +#endif // CONFIG_INTERNAL_STATS
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c index bdd2177..8b03e91 100644 --- a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c +++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -67,7 +67,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -83,7 +83,7 @@ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); - store_unaligned_u16_2x2(dst, dst_stride, blend); + store_u16x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c index 36d763a..90b44fc 100644 --- a/aom_dsp/arm/highbd_blend_a64_mask_neon.c +++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -91,7 +91,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -139,7 +139,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -182,7 +182,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -227,7 +227,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -325,7 +325,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -373,7 +373,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -416,7 +416,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -460,7 +460,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - 
store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c index ea3d655..1292e20 100644 --- a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c +++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -70,7 +70,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; @@ -90,7 +90,7 @@ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); - store_unaligned_u16_2x2(dst, dst_stride, blend); + store_u16x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride;
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c index e25438c..99ad0ba 100644 --- a/aom_dsp/arm/highbd_convolve8_neon.c +++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -19,199 +19,208 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -static INLINE int32x4_t highbd_convolve8_4_s32( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { - const int16x4_t y_filter_lo = vget_low_s16(y_filter); - const int16x4_t y_filter_hi = vget_high_s16(y_filter); +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, const uint16x4_t max) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); - int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); - sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); - sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); - sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); - sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); - sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); - sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); - sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, filter_hi, 3); - return sum; + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + + return vmin_u16(res, max); } -static INLINE uint16x4_t highbd_convolve8_4_s32_s16( - const int16x4_t s0, const int16x4_t s1, 
const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { - int32x4_t sum = - highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, const uint16x8_t max) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); - return vqrshrun_n_s32(sum, FILTER_BITS); + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + + return vminq_u16(res, max); } -static INLINE int32x4_t highbd_convolve8_horiz4_s32( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - const int16x8_t s2 = vextq_s16(s0, s1, 1); - const 
int16x8_t s3 = vextq_s16(s0, s1, 2); - const int16x8_t s4 = vextq_s16(s0, s1, 3); - const int16x4_t s0_lo = vget_low_s16(s0); - const int16x4_t s1_lo = vget_low_s16(s2); - const int16x4_t s2_lo = vget_low_s16(s3); - const int16x4_t s3_lo = vget_low_s16(s4); - const int16x4_t s4_lo = vget_high_s16(s0); - const int16x4_t s5_lo = vget_high_s16(s2); - const int16x4_t s6_lo = vget_high_s16(s3); - const int16x4_t s7_lo = vget_high_s16(s4); - - return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo, - s7_lo, x_filter_0_7); -} - -static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7); - - return vqrshrun_n_s32(sum, FILTER_BITS); -} - -static INLINE void highbd_convolve8_8_s32( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, - int32x4_t *sum0, int32x4_t *sum1) { - const int16x4_t y_filter_lo = vget_low_s16(y_filter); - const int16x4_t y_filter_hi = vget_high_s16(y_filter); - - *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); - *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); - - *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); - *sum1 = 
vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); - *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); -} - -static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0, - const int16x8_t s0_hi, - const int16x8_t x_filter_0_7, - int32x4_t *sum0, - int32x4_t *sum1) { - const int16x8_t s1 = vextq_s16(s0, s0_hi, 1); - const int16x8_t s2 = vextq_s16(s0, s0_hi, 2); - const int16x8_t s3 = vextq_s16(s0, s0_hi, 3); - const int16x8_t s4 = vextq_s16(s0, s0_hi, 4); - const int16x8_t s5 = vextq_s16(s0, s0_hi, 5); - const int16x8_t s6 = vextq_s16(s0, s0_hi, 6); - const int16x8_t s7 = vextq_s16(s0, s0_hi, 7); - - highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0, - sum1); -} - -static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { - int32x4_t sum0, sum1; - highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1); - - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); -} - -static INLINE uint16x8_t highbd_convolve8_8_s32_s16( - const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, - const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { - int32x4_t sum0; - int32x4_t sum1; - highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0, - &sum1); - - return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), - vqrshrun_n_s32(sum1, FILTER_BITS)); -} - -static void highbd_convolve_horiz_neon(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, - const int16_t *x_filter_ptr, - int x_step_q4, int w, int h, int bd) { +static void highbd_convolve_horiz_8tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, 
const int16_t *x_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { - int16x8_t s0, s1, s2, s3; - load_s16_8x2(s, src_stride, &s0, &s2); - load_s16_8x2(s + 8, src_stride, &s1, &s3); + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); - uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter); - uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter); + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], x_filter, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], x_filter, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], x_filter, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], x_filter, max); - uint16x8_t d01 = vcombine_u16(d0, d1); - d01 = vminq_u16(d01, max); + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); - vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); - - s += 2 * src_stride; - d += 2 * dst_stride; - h -= 2; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; } while (h > 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; - 
int x_q4 = 0; - - const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS]; - int16x8_t s0, s2, s4, s6; - load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6); - src_x += 8; do { - int16x8_t s1, s3, s5, s7; - load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7); + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); - uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter); - uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter); - uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter); - uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], x_filter, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], x_filter, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], x_filter, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], x_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); - s0 = s1; - s2 = s3; - s4 = s5; - s6 = s7; - src_x += 8; + s += 8; d += 8; width -= 8; - x_q4 += 8 * x_step_q4; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +static void highbd_convolve_horiz_4tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int 
w, int h, int bd) { + assert(w >= 4 && h >= 4); + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max); + uint16x4_t d1 = + highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max); + uint16x4_t d2 = + highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], x_filter, max); + uint16x4_t d3 = + highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = + highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max); + uint16x8_t d1 = + highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max); + uint16x8_t d2 = + highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], x_filter, max); + uint16x8_t d3 = + highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; } while (width > 0); 
src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -236,21 +245,30 @@ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; - highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, w, h, bd); + + const int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, + filter_x, w, h, bd); + } else if (filter_taps == 4) { + highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, + filter_x, w, h, bd); + } else { + highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride, + filter_x, w, h, bd); + } } } -static void highbd_convolve_vert_neon(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, - const int16_t *y_filter_ptr, int w, int h, - int bd) { +static void highbd_convolve_vert_8tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; @@ -263,24 +281,15 @@ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = - highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x4_t d1 = - highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x4_t d2 = - highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x4_t d3 = - highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, 
y_filter, max); - uint16x8_t d01 = vcombine_u16(d0, d1); - uint16x8_t d23 = vcombine_u16(d2, d3); - - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - - vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); - vst1_u16(d + 2 * dst_stride, vget_low_u16(d23)); - vst1_u16(d + 3 * dst_stride, vget_high_u16(d23)); + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -289,11 +298,14 @@ s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { int height = h; const int16_t *s = (const int16_t *)src_ptr; @@ -307,19 +319,14 @@ int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); - uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, - s7, y_filter); - uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, - s8, y_filter); - uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, - s9, y_filter); - uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, - s10, y_filter); - - d0 = vminq_u16(d0, max); - d1 = vminq_u16(d1, max); - d2 = vminq_u16(d2, max); - d3 = vminq_u16(d3, max); + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -330,6 +337,7 @@ s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; height -= 4; @@ -357,7 +365,18 @@ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= (SUBPEL_TAPS / 2 - 1) * src_stride; - highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h, - bd); + + const int filter_taps = 
get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, + dst_stride, filter_y, w, h, bd); + } else if (filter_taps == 4) { + highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, w, h, bd); + } else { + highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, + w, h, bd); + } } }
diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h new file mode 100644 index 0000000..b87b4ba --- /dev/null +++ b/aom_dsp/arm/highbd_convolve8_neon.h
@@ -0,0 +1,279 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ +#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void highbd_convolve8_horiz_2tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { + // Bilinear filter values are all positive and multiples of 8. Divide by 8 to + // reduce intermediate precision requirements and allow the use of non + // widening multiply. + const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); + const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + uint16x8_t s0 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride); + uint16x8_t s1 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride); + uint16x8_t s2 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride); + uint16x8_t s3 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. 
+ sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); + store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + do { + int width = w; + const uint16_t *s = src_ptr; + uint16_t *d = dst_ptr; + + do { + uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0); + uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1); + uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0); + uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. + sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + vst1q_u16(d + 0 * dst_stride, sum01); + vst1q_u16(d + 1 * dst_stride, sum23); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { + int32x4_t sum0 = 
vmull_lane_s16(vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_4tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { + assert(w >= 4 && h >= 4); + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, 
&s4, &s5, &s6); + + uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } +} + +static INLINE void highbd_convolve8_vert_2tap_neon( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { + // Bilinear filter values are all positive and multiples of 8. Divide by 8 to + // reduce intermediate precision requirements and allow the use of non + // widening multiply. + const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); + const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + uint16x8_t s0 = + load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride); + uint16x8_t s1 = + load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride); + uint16x8_t s2 = + load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride); + uint16x8_t s3 = + load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s2, f0); + sum23 = vmlaq_u16(sum23, s3, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. 
+ sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); + store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + do { + int width = w; + const uint16_t *s = src_ptr; + uint16_t *d = dst_ptr; + + do { + uint16x8_t s0, s1, s2; + load_u16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x8_t sum01 = vmulq_u16(s0, f0); + sum01 = vmlaq_u16(sum01, s1, f1); + uint16x8_t sum23 = vmulq_u16(s1, f0); + sum23 = vmlaq_u16(sum23, s2, f1); + + // We divided filter taps by 8 so subtract 3 from right shift. + sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); + sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); + + sum01 = vminq_u16(sum01, max); + sum23 = vminq_u16(sum23, max); + + vst1q_u16(d + 0 * dst_stride, sum01); + vst1q_u16(d + 1 * dst_stride, sum23); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c new file mode 100644 index 0000000..f519395 --- /dev/null +++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -0,0 +1,571 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/highbd_convolve8_neon.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = 
aom_sdotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_horiz_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { + const int16x8_t filter = vld1q_s16(filter_x); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max); + uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max); + uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max); + uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + do { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + 
&s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max); + uint16x8_t d1 = highbd_convolve8_8_h(s1, filter, max); + uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max); + uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[16]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, +}; + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, + uint16x8x2_t permute_tbl, + uint16x4_t max) { + int16x8_t permuted_samples0 = aom_tbl_s16(s, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s, permute_tbl.val[1]); + + int64x2_t sum0 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples0, filter, 0); + int64x2_t sum1 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples1, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, + uint16x8_t idx, uint16x8_t max) { + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), 
vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = aom_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_horiz_4tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { + const int16x8_t filter = vcombine_s16(vld1_s16(filter_x + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_convolve4_4_h(s0, filter, permute_tbl, max); + uint16x4_t d1 = highbd_convolve4_4_h(s1, filter, permute_tbl, max); + uint16x4_t d2 = highbd_convolve4_4_h(s2, filter, permute_tbl, max); + uint16x4_t d3 = highbd_convolve4_4_h(s3, filter, permute_tbl, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_h(s0, filter, idx, max); + uint16x8_t d1 = highbd_convolve4_8_h(s1, filter, idx, max); + uint16x8_t d2 = highbd_convolve4_8_h(s2, filter, idx, max); + uint16x8_t d3 = 
highbd_convolve4_8_h(s3, filter, idx, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(x_step_q4 == 16); + assert(width >= 4 && height >= 4); + (void)filter_y; + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= SUBPEL_TAPS / 2 - 1; + + const int filter_taps = get_filter_taps_convolve8(filter_x); + + if (filter_taps == 2) { + highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, + filter_x, width, height, bd); + } else if (filter_taps == 4) { + highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride, + filter_x, width, height, bd); + } else { + highbd_convolve8_horiz_8tap_sve(src, src_stride, dst, dst_stride, filter_x, + width, height, bd); + } +} + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25, + // Shift left and insert two new columns in transposed 4x4 block. + 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27, + // Shift left and insert three new columns in transposed 4x4 block. 
+ 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29 +}; + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res_lo[0]: 00 10 20 30 01 11 21 31 + // res_lo[1]: 02 12 22 32 03 13 23 33 + // res_hi[0]: 04 14 24 34 05 15 25 35 + // res_hi[1]: 06 16 26 36 07 17 27 37 + + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); + + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); + + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); +} + +static INLINE void 
aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], + uint8x16_t tbl, int16x8_t res[4]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]), + vreinterpretq_s8_s16(t1[2]) }; + int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]), + vreinterpretq_s8_s16(t1[3]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); + res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl)); + res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl)); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint8x16_t tbl, int16x8_t res[2]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); +} + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[2]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[4]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], 
filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); + sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1); + + sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); + sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3])); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, + int bd) { + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint8x16_t merge_block_tbl[3]; + merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); + merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); + merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s78910[2]; + + // Transpose and shuffle the 4 lines that were loaded. 
+ transpose_concat_4x4(s7, s8, s9, s10, s78910); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s78910[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s78910); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + s3456[2] = s78910[2]; + s3456[3] = s78910[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(y_step_q4 == 16); + assert(width >= 4 && height >= 4); + (void)filter_x; + (void)y_step_q4; + (void)x_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride; + + const int filter_taps = get_filter_taps_convolve8(filter_y); + + if (filter_taps == 2) { + highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, + dst_stride, filter_y, width, height, bd); + } else if (filter_taps == 4) { + highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, width, 
height, bd); + } else { + highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y, + width, height, bd); + } +}
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index 366ca3f..eff773b 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -13,9 +13,11 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- @@ -1199,7 +1201,7 @@ // For width 16 and above. #define HIGHBD_SMOOTH_H_PREDICTOR(W) \ - void highbd_smooth_h_##W##xh_neon( \ + static void highbd_smooth_h_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ @@ -1265,3 +1267,1516 @@ HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_H_NXM_WIDE + +// ----------------------------------------------------------------------------- +// Z1 + +static int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; +static int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 }; + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0, + uint16x4_t a1, + int shift) { + // The C implementation of the z1 predictor uses (32 - shift) and a right + // shift by 5, however we instead double shift to avoid an unnecessary right + // shift by 1. 
+ uint32x4_t res = vmull_n_u16(a1, shift); + res = vmlal_n_u16(res, a0, 64 - shift); + return vrshrn_n_u32(res, 6); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0, + uint16x8_t a1, + int shift) { + return vcombine_u16( + highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift), + highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift)); +} + +// clang-format off +static const uint8_t kLoadMaxShuffles[] = { + 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +// clang-format on + +static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr, + int shuffle_idx) { + uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); + uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr)); +#if AOM_ARCH_AARCH64 + return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle)); +#else + uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; + uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); + uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); + return vreinterpretq_u16_u8(vcombine_u8(lo, hi)); +#endif +} + +static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = (bw + bh) - 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota1x8 = vld1q_s16(iota1_s16); + const int16x4_t iota1x4 = vget_low_s16(iota1x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 
6; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when not upsampling uses: + // ((x & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const int shift = x & 0x3e; + + if (bw == 4) { + const uint16x4_t a0 = vld1_u16(&above[base]); + const uint16x4_t a1 = vld1_u16(&above[base + 1]); + const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + uint16x8_t a0; + uint16x8_t a1; + if (base + c >= max_base_x) { + a0 = a1 = vdupq_n_u16(above_max); + } else { + if (base + c + 7 >= max_base_x) { + int shuffle_idx = max_base_x - base - c; + a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); + } else { + a0 = vld1q_u16(above + base + c); + } + if (base + c + 8 >= max_base_x) { + int shuffle_idx = max_base_x - base - c - 1; + a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); + } else { + a1 = vld1q_u16(above + base + c + 1); + } + } + + vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift)); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota2x8 = vld1q_s16(iota2_s16); + const int16x4_t iota2x4 = vget_low_s16(iota2x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 5; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) 
{ + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const int shift = (x << 1) & 0x3e; + + if (bw == 4) { + const uint16x4x2_t a01 = vld2_u16(&above[base]); + const uint16x4_t val = + highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]); + const uint16x8_t val = + highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift); + const uint16x8_t cmp = + vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8); + const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); + vst1q_u16(dst + c, res); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + + if (upsample_above) { + highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx); + } else { + highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx); + } +} + +// ----------------------------------------------------------------------------- +// Z2 + +#if AOM_ARCH_AARCH64 +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. 
+// X0, X1, X2, X3 +// Y0, X0, X1, X2 +// Y0, Y1, X0, X1 +// Y0, Y1, Y2, X0 +// Y0, Y1, Y2, Y3 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x4[5][8] = { + { 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 2, 3, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 8, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7 }, +}; +// clang-format on + +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. +// X0, X1, X2, X3, X4, X5, X6, X7 +// Y0, X0, X1, X2, X3, X4, X5, X6 +// Y0, Y1, X0, X1, X2, X3, X4, X5 +// Y0, Y1, Y2, X0, X1, X2, X3, X4 +// Y0, Y1, Y2, Y3, X0, X1, X2, X3 +// Y0, Y1, Y2, Y3, Y4, X0, X1, X2 +// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x8[9][16] = { + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, + { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }, + { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x4[5][4] = { + { 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x8[9][8] = { + { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 
0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
+  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
+  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
+  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
+  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+// Table-lookup gather of four uint16 values from an 8-element vector:
+// lane i of the result is `left_data[indices[i] - base]`. The result is then
+// ANDed with row `n` of z2_y_iter_masks_u16x4 (defined outside this section;
+// presumably it keeps only the first `n` lanes - confirm against the table).
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
+    const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
+  // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+  // indices so we can use a tbl instruction (which only operates on bytes).
+  uint8x8_t left_indices =
+      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+  // Copy the even (low, on little-endian lane layout) byte of every 16-bit
+  // index into both bytes of that element.
+  left_indices = vtrn1_u8(left_indices, left_indices);
+  // Double: element index -> byte offset of the element.
+  left_indices = vadd_u8(left_indices, left_indices);
+  // Add { 0, 1 } per byte pair so each pair selects bytes (2k, 2k + 1).
+  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
+  const uint16x4_t ret = vreinterpret_u16_u8(
+      vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
+  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+// Same as highbd_dr_prediction_z2_tbl_left_x4_from_x8, but the lookup table
+// is the 16-element pair `left_data.val[0]`, `left_data.val[1]`.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+    const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
+  // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+  // indices so we can use a tbl instruction (which only operates on bytes).
+  uint8x8_t left_indices =
+      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+  left_indices = vtrn1_u8(left_indices, left_indices);
+  left_indices = vadd_u8(left_indices, left_indices);
+  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
+  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+                             vreinterpretq_u8_u16(left_data.val[1]) } };
+  const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
+  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+// Eight-lane variant: lane i of the result is `left_data[indices[i] - base]`,
+// ANDed with row `n` of z2_y_iter_masks_u16x8.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+    const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
+  // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+  // indices so we can use a tbl instruction (which only operates on bytes).
+  uint8x16_t left_indices =
+      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+  left_indices = vtrn1q_u8(left_indices, left_indices);
+  left_indices = vaddq_u8(left_indices, left_indices);
+  left_indices =
+      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+  const uint16x8_t ret = vreinterpretq_u16_u8(
+      vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
+  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+
+// Eight-lane variant whose lookup table is the 16-element pair
+// `left_data.val[0]`, `left_data.val[1]`.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+    const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
+  // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+  // indices so we can use a tbl instruction (which only operates on bytes).
+  uint8x16_t left_indices =
+      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+  left_indices = vtrn1q_u8(left_indices, left_indices);
+  left_indices = vaddq_u8(left_indices, left_indices);
+  left_indices =
+      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+                             vreinterpretq_u8_u16(left_data.val[1]) } };
+  const uint16x8_t ret =
+      vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
+  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+#endif  // AOM_ARCH_AARCH64
+
+// Scalar-indexed gather of adjacent element pairs from `left`. For each lane
+// i < n, one 32-bit load fetches left[indices[i]] and left[indices[i] + 1].
+// Returns val[0] = { left[idx_i] } and val[1] = { left[idx_i + 1] }; lanes
+// i >= n are left at zero. Requires 1 <= n <= 4 (asserted).
+static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
+    const uint16_t *left, const int16x4_t indices, int n) {
+  assert(n > 0);
+  assert(n <= 4);
+  // Load two elements at a time and then uzp them into separate vectors, to
+  // reduce the number of memory accesses.
+  uint32x2_t ret0_u32 = vdup_n_u32(0);
+  uint32x2_t ret1_u32 = vdup_n_u32(0);
+
+  // Use a single vget_lane_u64 to minimize vector to general purpose register
+  // transfers and then mask off the bits we actually want.
+  const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
+  // The (int16_t) casts sign-extend, so negative indices stay negative.
+  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+
+  // At time of writing both Clang and GCC produced better code with these
+  // nested if-statements compared to a switch statement with fallthrough.
+  ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+  if (n > 1) {
+    ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+    if (n > 2) {
+      ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+      if (n > 3) {
+        ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+      }
+    }
+  }
+  // De-interleave: even 16-bit elements -> val[0], odd elements -> val[1].
+  return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
+                  vreinterpret_u16_u32(ret1_u32));
+}
+
+// Eight-lane version of highbd_dr_prediction_z2_gather_left_x4. Requires
+// 1 <= n <= 8 (asserted); lanes i >= n are left at zero.
+static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
+    const uint16_t *left, const int16x8_t indices, int n) {
+  assert(n > 0);
+  assert(n <= 8);
+  // Load two elements at a time and then uzp them into separate vectors, to
+  // reduce the number of memory accesses.
+  uint32x4_t ret0_u32 = vdupq_n_u32(0);
+  uint32x4_t ret1_u32 = vdupq_n_u32(0);
+
+  // Use a pair of vget_lane_u64 to minimize vector to general purpose register
+  // transfers and then mask off the bits we actually want.
+  const uint64_t indices0123 =
+      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
+  const uint64_t indices4567 =
+      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
+  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+  const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
+  const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
+  const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
+  const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
+
+  // At time of writing both Clang and GCC produced better code with these
+  // nested if-statements compared to a switch statement with fallthrough.
+  ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+  if (n > 1) {
+    ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+    if (n > 2) {
+      ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+      if (n > 3) {
+        ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+        if (n > 4) {
+          ret1_u32 =
+              vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+          if (n > 5) {
+            ret1_u32 =
+                vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+            if (n > 6) {
+              ret1_u32 =
+                  vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+              if (n > 7) {
+                ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
+                                          ret1_u32, 3);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
+                   vreinterpretq_u16_u32(ret1_u32));
+}
+
+// Merges the `left`-derived and `above`-derived results of one row. In the
+// generic path the first `base_shift` lanes come from `out_y` and lane
+// c2 >= base_shift takes out_x[c2 - base_shift]. The AArch64 path does the
+// merge with a byte shuffle from z2_merge_shuffles_u16x4 (defined outside
+// this section; presumably encodes the same permutation - confirm).
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
+    uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
+  assert(base_shift >= 0);
+  assert(base_shift <= 4);
+  // On AArch64 we can permute the data from the `above` and `left` vectors
+  // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+  const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
+                                 vreinterpret_u8_u16(out_x) } };
+  return vreinterpret_u16_u8(
+      vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
+#else
+  uint16x4_t out = out_y;
+  for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
+    out[c2] = out_x[x_idx];
+  }
+  return out;
+#endif
+}
+
+// Eight-lane version of highbd_dr_prediction_z2_merge_x4
+// (0 <= base_shift <= 8, asserted).
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
+    uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
+  assert(base_shift >= 0);
+  assert(base_shift <= 8);
+  // On AArch64 we can permute the data from the `above` and `left` vectors
+  // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+  const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
+                                  vreinterpretq_u8_u16(out_x) } };
+  return vreinterpretq_u16_u8(
+      vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
+#else
+  uint16x8_t out = out_y;
+  for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
+    out[c2] = out_x[x_idx];
+  }
+  return out;
+#endif
+}
+
+// Two-tap interpolation with 5-bit weights, per lane:
+//   (a0 * (32 - shift) + a1 * shift + 16) >> 5
+// `shift` is reinterpreted as unsigned, so lanes must be non-negative.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
+    uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
+  uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
+  res =
+      vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
+  // Rounding shift right by 5, narrowing back to 16 bits.
+  return vrshrn_n_u32(res, 5);
+}
+
+// Eight-lane version: applies the x4 helper to the low and high halves.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
+    uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
+  return vcombine_u16(
+      highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
+                                             vget_low_s16(shift)),
+      highbd_dr_prediction_z2_apply_shift_x4(
+          vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
+}
+
+// Computes four output pixels of row `r`, starting at column `c`, for the
+// non-upsampled case (6 fractional bits). `above0` / `above1` are the
+// caller's pre-loaded vectors starting at above[-1] and above[0]; they are
+// only used in the mixed case. `left` is indexed as `left + 1 + base_y`, so
+// the caller passes a pointer to the element that corresponds to index -1.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
+    const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
+    const uint16_t *left, int dx, int dy, int r, int c) {
+  const int16x4_t iota = vld1_s16(iota1_s16);
+
+  const int x0 = (c << 6) - (r + 1) * dx;
+  const int y0 = (r << 6) - (c + 1) * dy;
+
+  const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
+  const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
+  const int16x4_t shift_x0123 =
+      vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+  const int16x4_t shift_y0123 =
+      vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
+  const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);
+
+  const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+  // Based on the value of `base_shift` there are three possible cases to
+  // compute the result:
+  // 1) base_shift <= 0: We can load and operate entirely on data from the
+  //                     `above` input vector.
+  // 2) base_shift < vl: We can load from `above[-1]` and shift
+  //                     `vl - base_shift` elements across to the end of the
+  //                     vector, then compute the remainder from `left`.
+  // 3) base_shift >= vl: We can load and operate entirely on data from the
+  //                      `left` input vector.
+
+  if (base_shift <= 0) {
+    const int base_x = x0 >> 6;
+    const uint16x4_t a0 = vld1_u16(above + base_x);
+    const uint16x4_t a1 = vld1_u16(above + base_x + 1);
+    return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+  } else if (base_shift < 4) {
+    const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
+        left + 1, base_y0123, base_shift);
+    const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
+        l01.val[0], l01.val[1], shift_y0123);
+
+    // No need to reload from above in the loop, just use pre-loaded constants.
+    const uint16x4_t out16_x =
+        highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);
+
+    return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
+  } else {
+    const uint16x4x2_t l01 =
+        highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
+    return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
+                                                  shift_y0123);
+  }
+}
+
+// Eight-lane version of highbd_dr_prediction_z2_step_x4: computes eight
+// output pixels of row `r`, starting at column `c`.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
+    const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
+    const uint16_t *left, int dx, int dy, int r, int c) {
+  const int16x8_t iota = vld1q_s16(iota1_s16);
+
+  const int x0 = (c << 6) - (r + 1) * dx;
+  const int y0 = (r << 6) - (c + 1) * dy;
+
+  const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
+  const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
+  const int16x8_t shift_x01234567 =
+      vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
+  const int16x8_t shift_y01234567 =
+      vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
+  const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);
+
+  const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+  // Based on the value of `base_shift` there are three possible cases to
+  // compute the result:
+  // 1) base_shift <= 0: We can load and operate entirely on data from the
+  //                     `above` input vector.
+  // 2) base_shift < vl: We can load from `above[-1]` and shift
+  //                     `vl - base_shift` elements across to the end of the
+  //                     vector, then compute the remainder from `left`.
+  // 3) base_shift >= vl: We can load and operate entirely on data from the
+  //                      `left` input vector.
+
+  if (base_shift <= 0) {
+    const int base_x = x0 >> 6;
+    const uint16x8_t a0 = vld1q_u16(above + base_x);
+    const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
+    return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+  } else if (base_shift < 8) {
+    const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
+        left + 1, base_y01234567, base_shift);
+    const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
+        l01.val[0], l01.val[1], shift_y01234567);
+
+    // No need to reload from above in the loop, just use pre-loaded constants.
+    const uint16x8_t out16_x =
+        highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);
+
+    return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
+  } else {
+    const uint16x8x2_t l01 =
+        highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
+    return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
+                                                  shift_y01234567);
+  }
+}
+
+// Left array is accessed from -1 through `bh - 1` inclusive.
+// Above array is accessed from -1 through `bw - 1` inclusive.
+// Defines highbd_dr_prediction_z2_<bw>x<bh>_neon for a block size that is
+// never predicted with edge upsampling (both upsample flags are asserted to
+// be zero). left[-1 .. bh - 1] is copied into a local buffer whose element 0
+// corresponds to left[-1]; the step helpers index it as `left + 1 + base_y`.
+// above[-1 ..] and above[0 ..] are pre-loaded once into a0 / a1 and passed to
+// every step. A bw == 4 block is produced with the x4 step, wider blocks
+// with the x8 step in 8-column strides.
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \
+  static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int upsample_above, int upsample_left, int dx, \
+      int dy, int bd) { \
+    (void)bd; \
+    (void)upsample_above; \
+    (void)upsample_left; \
+    assert(!upsample_above); \
+    assert(!upsample_left); \
+    assert(bw % 4 == 0); \
+    assert(bh % 4 == 0); \
+    assert(dx > 0); \
+    assert(dy > 0); \
+ \
+    uint16_t left_data[bh + 1]; \
+    memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \
+ \
+    uint16x8_t a0, a1; \
+    if (bw == 4) { \
+      a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \
+      a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \
+    } else { \
+      a0 = vld1q_u16(above - 1); \
+      a1 = vld1q_u16(above + 0); \
+    } \
+ \
+    int r = 0; \
+    do { \
+      if (bw == 4) { \
+        vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \
+                          above, vget_low_u16(a0), vget_low_u16(a1), \
+                          left_data, dx, dy, r, 0)); \
+      } else { \
+        int c = 0; \
+        do { \
+          vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \
+                                 above, a0, a1, left_data, dx, dy, r, c)); \
+          c += 8; \
+        } while (c < bw); \
+      } \
+      dst += stride; \
+    } while (++r < bh); \
+  }
+
+// Instantiate the generic kernel for every size handled by the macro; the
+// 4x4, 4x8, 8x4 and 8x8 kernels are written out separately.
+HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
+
+#undef HIGHBD_DR_PREDICTOR_Z2_WXH
+
+// Common signature of all fixed-size zone 2 kernels.
+typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left,
+                                            int upsample_above,
+                                            int upsample_left, int dx, int dy,
+                                            int bd);
+
+static
void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left,
+                                             int upsample_above,
+                                             int upsample_left, int dx, int dy,
+                                             int bd) {
+  // Zone 2 predictor for 4x4 blocks, with optional upsampling of either edge.
+  // For every row, `base_shift` selects one of three paths: all four pixels
+  // from `above`, all four from `left`, or the first `base_shift` pixels from
+  // `left` merged with the rest from `above`.
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+  const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+  // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+  // left_data0 / left_data1 hold the same window of `left`, offset by one
+  // element, so the two interpolation taps can use identical table indices.
+  uint16x8_t left_data0, left_data1;
+  if (upsample_left) {
+    left_data0 = vld1q_u16(left - 2);
+    left_data1 = vld1q_u16(left - 1);
+  } else {
+    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+  }
+#endif
+
+  const int16x4_t iota0123 = vld1_s16(iota1_s16);
+  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+  for (int r = 0; r < 4; ++r) {
+    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+    const int x0 = (r + 1) * dx;
+    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+    const int base_x0 = (-x0) >> frac_bits_x;
+    if (base_shift <= 0) {
+      uint16x4_t a0, a1;
+      int16x4_t shift_x0123;
+      if (upsample_above) {
+        // De-interleaving load: even elements -> a0, odd elements -> a1.
+        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+      } else {
+        a0 = vld1_u16(above + base_x0);
+        a1 = vld1_u16(above + base_x0 + 1);
+        shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+      }
+      vst1_u16(dst,
+               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+    } else if (base_shift < 4) {
+      // Calculate Y component from `left`.
+      const int y_iters = base_shift;
+      const int16x4_t y0123 =
+          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+      // Shift by a negative count == arithmetic shift right by frac_bits_y.
+      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+      const int16x4_t shift_y0123 = vshr_n_s16(
+          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+      uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+                                                       left_data_base, y_iters);
+      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+                                                       left_data_base, y_iters);
+#else
+      const uint16x4x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      const uint16x4_t out_y =
+          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+      // Calculate X component from `above`.
+      const int16x4_t shift_x0123 = vshr_n_s16(
+          vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
+          1);
+      uint16x4_t a0, a1;
+      if (upsample_above) {
+        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+      } else {
+        a0 = vld1_u16(above - 1);
+        a1 = vld1_u16(above + 0);
+      }
+      const uint16x4_t out_x =
+          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+      // Combine X and Y vectors.
+      const uint16x4_t out =
+          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+      vst1_u16(dst, out);
+    } else {
+      const int16x4_t y0123 =
+          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+      const int16x4_t shift_y0123 = vshr_n_s16(
+          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+      uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+                                                       left_data_base, 4);
+      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+                                                       left_data_base, 4);
+#else
+      const uint16x4x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+      vst1_u16(dst,
+               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+    }
+    dst += stride;
+  }
+}
+
+// Zone 2 predictor for 4-wide, 8-high blocks, with optional upsampling of
+// either edge. Same three-way row dispatch as the 4x4 kernel; the `left`
+// window is held in a 16-element table on AArch64.
+static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left,
+                                             int upsample_above,
+                                             int upsample_left, int dx, int dy,
+                                             int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+  const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+  // else we only need -1 through 7 inclusive.
+
+#if AOM_ARCH_AARCH64
+  uint16x8x2_t left_data0, left_data1;
+  if (upsample_left) {
+    left_data0 = vld1q_u16_x2(left - 2);
+    left_data1 = vld1q_u16_x2(left - 1);
+  } else {
+    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+  }
+#endif
+
+  const int16x4_t iota0123 = vld1_s16(iota1_s16);
+  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+  for (int r = 0; r < 8; ++r) {
+    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+    const int x0 = (r + 1) * dx;
+    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+    const int base_x0 = (-x0) >> frac_bits_x;
+    if (base_shift <= 0) {
+      uint16x4_t a0, a1;
+      int16x4_t shift_x0123;
+      if (upsample_above) {
+        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+      } else {
+        a0 = vld1_u16(above + base_x0);
+        a1 = vld1_u16(above + base_x0 + 1);
+        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+      }
+      vst1_u16(dst,
+               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+    } else if (base_shift < 4) {
+      // Calculate Y component from `left`.
+      const int y_iters = base_shift;
+      const int16x4_t y0123 =
+          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+      const int16x4_t shift_y0123 = vshr_n_s16(
+          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+      uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+          left_data0, base_y0123, left_data_base, y_iters);
+      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+          left_data1, base_y0123, left_data_base, y_iters);
+#else
+      const uint16x4x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      const uint16x4_t out_y =
+          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+      // Calculate X component from `above`.
+      uint16x4_t a0, a1;
+      int16x4_t shift_x0123;
+      if (upsample_above) {
+        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+      } else {
+        a0 = vld1_u16(above - 1);
+        a1 = vld1_u16(above + 0);
+        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+      }
+      const uint16x4_t out_x =
+          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+      // Combine X and Y vectors.
+      const uint16x4_t out =
+          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+      vst1_u16(dst, out);
+    } else {
+      const int16x4_t y0123 =
+          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+      const int16x4_t shift_y0123 = vshr_n_s16(
+          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+      uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
+                                                        left_data_base, 4);
+      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
+                                                        left_data_base, 4);
+#else
+      const uint16x4x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      vst1_u16(dst,
+               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+    }
+    dst += stride;
+  }
+}
+
+// Zone 2 predictor for 8-wide, 4-high blocks, with optional upsampling of
+// either edge. Each row is produced as one 8-lane vector; the mixed path is
+// taken while 0 < base_shift < 8.
+static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left,
+                                             int upsample_above,
+                                             int upsample_left, int dx, int dy,
+                                             int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+  const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+  // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+  uint16x8_t left_data0, left_data1;
+  if (upsample_left) {
+    left_data0 = vld1q_u16(left - 2);
+    left_data1 = vld1q_u16(left - 1);
+  } else {
+    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+  }
+#endif
+
+  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+  for (int r = 0; r < 4; ++r) {
+    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+    const int x0 = (r + 1) * dx;
+    const int16x8_t x01234567 =
+        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+    const int base_x0 = (-x0) >> frac_bits_x;
+    if (base_shift <= 0) {
+      uint16x8_t a0, a1;
+      int16x8_t shift_x01234567;
+      if (upsample_above) {
+        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+      } else {
+        a0 = vld1q_u16(above + base_x0);
+        a1 = vld1q_u16(above + base_x0 + 1);
+        shift_x01234567 =
+            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+      }
+      vst1q_u16(
+          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+    } else if (base_shift < 8) {
+      // Calculate Y component from `left`.
+      const int y_iters = base_shift;
+      const int16x8_t y01234567 =
+          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+      const int16x8_t base_y01234567 =
+          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+      const int16x8_t shift_y01234567 =
+          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+                                vdupq_n_s16(0x3F)),
+                      1);
+
+      uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+          left_data0, base_y01234567, left_data_base, y_iters);
+      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+          left_data1, base_y01234567, left_data_base, y_iters);
+#else
+      const uint16x8x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      const uint16x8_t out_y =
+          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+      // Calculate X component from `above`.
+      uint16x8_t a0, a1;
+      int16x8_t shift_x01234567;
+      if (upsample_above) {
+        const uint16x8x2_t a01 =
+            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+      } else {
+        a0 = vld1q_u16(above - 1);
+        a1 = vld1q_u16(above + 0);
+        shift_x01234567 =
+            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+      }
+      const uint16x8_t out_x =
+          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+      // Combine X and Y vectors.
+      const uint16x8_t out =
+          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+      vst1q_u16(dst, out);
+    } else {
+      const int16x8_t y01234567 =
+          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+      const int16x8_t base_y01234567 =
+          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+      const int16x8_t shift_y01234567 =
+          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+                                vdupq_n_s16(0x3F)),
+                      1);
+
+      uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+          left_data0, base_y01234567, left_data_base, 8);
+      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+          left_data1, base_y01234567, left_data_base, 8);
+#else
+      const uint16x8x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      vst1q_u16(
+          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+    }
+    dst += stride;
+  }
+}
+
+// Zone 2 predictor for 8x8 blocks, with optional upsampling of either edge.
+// Same structure as the 8x4 kernel, with a 16-element `left` table on
+// AArch64.
+static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left,
+                                             int upsample_above,
+                                             int upsample_left, int dx, int dy,
+                                             int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+  const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+  // else we only need -1 through 7 inclusive.
+
+#if AOM_ARCH_AARCH64
+  uint16x8x2_t left_data0, left_data1;
+  if (upsample_left) {
+    left_data0 = vld1q_u16_x2(left - 2);
+    left_data1 = vld1q_u16_x2(left - 1);
+  } else {
+    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+  }
+#endif
+
+  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+  for (int r = 0; r < 8; ++r) {
+    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+    const int x0 = (r + 1) * dx;
+    const int16x8_t x01234567 =
+        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+    const int base_x0 = (-x0) >> frac_bits_x;
+    if (base_shift <= 0) {
+      uint16x8_t a0, a1;
+      int16x8_t shift_x01234567;
+      if (upsample_above) {
+        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+      } else {
+        a0 = vld1q_u16(above + base_x0);
+        a1 = vld1q_u16(above + base_x0 + 1);
+        shift_x01234567 =
+            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+      }
+      vst1q_u16(
+          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+    } else if (base_shift < 8) {
+      // Calculate Y component from `left`.
+      const int y_iters = base_shift;
+      const int16x8_t y01234567 =
+          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+      const int16x8_t base_y01234567 =
+          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+      const int16x8_t shift_y01234567 =
+          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+                                vdupq_n_s16(0x3F)),
+                      1);
+
+      uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+          left_data0, base_y01234567, left_data_base, y_iters);
+      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+          left_data1, base_y01234567, left_data_base, y_iters);
+#else
+      const uint16x8x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      const uint16x8_t out_y =
+          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+      // Calculate X component from `above`.
+      uint16x8_t a0, a1;
+      int16x8_t shift_x01234567;
+      if (upsample_above) {
+        const uint16x8x2_t a01 =
+            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+        a0 = a01.val[0];
+        a1 = a01.val[1];
+        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+      } else {
+        a0 = vld1q_u16(above - 1);
+        a1 = vld1q_u16(above + 0);
+        shift_x01234567 =
+            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+      }
+      const uint16x8_t out_x =
+          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+      // Combine X and Y vectors.
+      const uint16x8_t out =
+          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+      vst1q_u16(dst, out);
+    } else {
+      const int16x8_t y01234567 =
+          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+      const int16x8_t base_y01234567 =
+          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+      const int16x8_t shift_y01234567 =
+          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+                                vdupq_n_s16(0x3F)),
+                      1);
+
+      uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+      const int left_data_base = upsample_left ? -2 : -1;
+      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+          left_data0, base_y01234567, left_data_base, 8);
+      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+          left_data1, base_y01234567, left_data_base, 8);
+#else
+      const uint16x8x2_t l01 =
+          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+      l0 = l01.val[0];
+      l1 = l01.val[1];
+#endif
+
+      vst1q_u16(
+          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+    }
+    dst += stride;
+  }
+}
+
+// Kernel table indexed as [get_msb(bw)][get_msb(bh)] by the dispatcher below.
+// Rows / columns 2..6 hold the entries for the sizes that have a kernel;
+// every other slot is NULL.
+static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
+    &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
+    NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
+    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
+    &highbd_dr_prediction_z2_8x32_neon, NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
+    &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
+    &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
+  { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
+    &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
+    &highbd_dr_prediction_z2_32x64_neon },
+  { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
+    &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
+};
+
+// Directional prediction, zone 2: 90 < angle < 180
+// Looks up the size-specific kernel and forwards all arguments to it. The
+// (bw, bh) pair must have a non-NULL table entry (asserted).
+void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy,
+                                      int bd) {
+  highbd_dr_prediction_z2_ptr f =
+      dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
+  assert(f != NULL);
+  f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
+}
+
+//
-----------------------------------------------------------------------------
+// Z3
+
+// Both the lane to use and the shift amount must be immediates.
+// Computes `*out` = rounding-narrowed ((in0 * s0[lane] + in1 * s1[lane]) >>
+// shift) for four pixels, then replaces every lane whose index
+// `iota + base` is not below `max_base_y` with `left_max`. Note that
+// `max_base_y` and `left_max` are not parameters: they are taken from the
+// scope in which the macro is expanded.
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
+                                       lane, shift) \
+  do { \
+    uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \
+    val = vmlal_lane_u16(val, (in1), (s1), (lane)); \
+    const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \
+    const uint16x4_t res = vrshrn_n_u32(val, (shift)); \
+    *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \
+                      vdup_n_u16(left_max)); \
+  } while (0)
+
+// Eight-lane interpolation step. Unlike the X4 variant it performs no
+// `max_base_y` clamping; `iota` and `base` are accepted but unused.
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
+                                       lane, shift) \
+  do { \
+    uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \
+    val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
+    uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
+    val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
+    *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
+                          vrshrn_n_u32(val_hi, (shift))); \
+  } while (0)
+
+// Returns the two 8-lane vectors that start at left0[ofs] and
+// left0[ofs + 1]. A plain load is used only while all eight lanes stay below
+// `max_ofs`; otherwise the vector is built by zn_load_masked_neon from the
+// window starting at left0[max_ofs - 7], so nothing past left0[max_ofs] is
+// read here. (zn_load_masked_neon is defined outside this section;
+// presumably it shifts that window and pads with the last element - confirm.)
+static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
+                                             int max_ofs) {
+  uint16x8_t r0;
+  uint16x8_t r1;
+  if (ofs + 7 >= max_ofs) {
+    int shuffle_idx = max_ofs - ofs;
+    r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
+  } else {
+    r0 = vld1q_u16(left0 + ofs);
+  }
+  if (ofs + 8 >= max_ofs) {
+    int shuffle_idx = max_ofs - ofs - 1;
+    r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
+  } else {
+    r1 = vld1q_u16(left0 + ofs + 1);
+  }
+  return (uint16x8x2_t){ { r0, r1 } };
+}
+
+static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
+                                                   ptrdiff_t stride, int bw,
+                                                   int bh, const uint16_t *left,
+                                                   int dy) {
+  assert(bw % 4 == 0);
+  assert(bh % 4 == 0);
+  assert(dy > 0);
+
+  // Factor out left + 1 to give the compiler a better chance of recognising
+  // that the offsets used for the loads from left and left + 1 are otherwise
+  // identical.
+  const uint16_t *left1 = left + 1;
+
+  const int max_base_y = (bw + bh - 1);
+  const int left_max = left[max_base_y];
+  const int frac_bits = 6;
+
+  const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
+  const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
+
+  // The C implementation of the z3 predictor when not upsampling uses:
+  // ((y & 0x3f) >> 1)
+  // The right shift is unnecessary here since we instead shift by +1 later,
+  // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
+  const uint16x4_t shift_mask = vdup_n_u16(0x3e);
+
+  if (bh == 4) {
+    int y = dy;
+    int c = 0;
+    do {
+      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+      // multiply instructions.
+      const uint16x4_t shifts1 =
+          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+      const int base0 = (y + 0 * dy) >> frac_bits;
+      const int base1 = (y + 1 * dy) >> frac_bits;
+      const int base2 = (y + 2 * dy) >> frac_bits;
+      const int base3 = (y + 3 * dy) >> frac_bits;
+      uint16x4_t out[4];
+      if (base0 >= max_base_y) {
+        out[0] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l00 = vld1_u16(left + base0);
+        const uint16x4_t l01 = vld1_u16(left1 + base0);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
+                                       shifts0, shifts1, 0, 6);
+      }
+      if (base1 >= max_base_y) {
+        out[1] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l10 = vld1_u16(left + base1);
+        const uint16x4_t l11 = vld1_u16(left1 + base1);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
+                                       shifts0, shifts1, 1, 6);
+      }
+      if (base2 >= max_base_y) {
+        out[2] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l20 = vld1_u16(left + base2);
+        const uint16x4_t l21 = vld1_u16(left1 + base2);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
+                                       shifts0, shifts1, 2, 6);
+      }
+      if (base3 >= max_base_y) {
+        out[3] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t
l30 = vld1_u16(left + base3); + const uint16x4_t l31 = vld1_u16(left1 + base3); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31, + shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. + const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + r; + const int base1 = ((y + 1 * dy) >> frac_bits) + r; + const int base2 = ((y + 2 * dy) >> frac_bits) + r; + const int base3 = ((y + 3 * dy) >> frac_bits) + r; + uint16x8_t out[4]; + if (base0 >= max_base_y) { + out[0] = vdupq_n_u16(left_max); + } else { + const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 6); + } + if (base1 >= max_base_y) { + out[1] = vdupq_n_u16(left_max); + } else { + const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 6); + } + if (base2 >= max_base_y) { + out[2] = vdupq_n_u16(left_max); + } else { + const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 6); + } + if (base3 >= max_base_y) { + out[3] = vdupq_n_u16(left_max); + } else { + const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { 
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, const uint16_t *left, + int dy) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << 1; + const int left_max = left[max_base_y]; + const int frac_bits = 5; + + const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16)); + const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16)); + const uint16x4_t iota2x4 = vget_low_u16(iota2x8); + + // The C implementation of the z3 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The two shifts are unnecessary here since the lowest bit is guaranteed to + // be zero when the mask is applied, so adjust the mask to 0x1f to avoid + // needing the shifts at all. + const uint16x4_t shift_mask = vdup_n_u16(0x1F); + + if (bh == 4) { + int y = dy; + int c = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = (y + 0 * dy) >> frac_bits; + const int base1 = (y + 1 * dy) >> frac_bits; + const int base2 = (y + 2 * dy) >> frac_bits; + const int base3 = (y + 3 * dy) >> frac_bits; + const uint16x4x2_t l0 = vld2_u16(left + base0); + const uint16x4x2_t l1 = vld2_u16(left + base1); + const uint16x4x2_t l2 = vld2_u16(left + base2); + const uint16x4x2_t l3 = vld2_u16(left + base3); + uint16x4_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + assert(bh % 8 == 0); + + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x8 block to allow us to use immediate lane-indexed + // multiply instructions. 
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2); + const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2); + const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2); + const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2); + const uint16x8x2_t l0 = vld2q_u16(left + base0); + const uint16x8x2_t l1 = vld2q_u16(left + base1); + const uint16x8x2_t l2 = vld2q_u16(left + base2); + const uint16x8x2_t l3 = vld2q_u16(left + base3); + uint16x8_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + (void)bd; + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx == 1); + assert(dy > 0); + + if (upsample_left) { + highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy); + } else { + highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy); + } +} + +#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4 +#undef 
HIGHBD_DR_PREDICTOR_Z3_STEP_X8
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c index 6149c9f..b351429 100644 --- a/aom_dsp/arm/highbd_quantize_neon.c +++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -14,6 +14,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/quantize.h"
diff --git a/aom_dsp/arm/highbd_sse_sve.c b/aom_dsp/arm/highbd_sse_sve.c new file mode 100644 index 0000000..9ea13ab --- /dev/null +++ b/aom_dsp/arm/highbd_sse_sve.c
@@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint64x2_t *sse) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + + *sse = aom_udotq_u16(*sse, abs_diff, abs_diff); +} + +static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 15 * 8, 
ref + 15 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + 
uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]); + highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + uint16x8_t s = load_unaligned_u16_4x2(src, src_stride); + uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride); + + uint16x8_t abs_diff = vabdq_u16(s, r); + sse = aom_udotq_u16(sse, abs_diff, abs_diff); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(sse); +} + +static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + svuint64_t sse = svdup_n_u64(0); + uint64_t step = svcnth(); + + do { + int w = 0; + const uint16_t *src_ptr = src; + const uint16_t *ref_ptr = ref; + + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svuint16_t s = svld1_u16(pred, src_ptr); + svuint16_t r = svld1_u16(pred, ref_ptr); + + svuint16_t abs_diff = svabd_u16_z(pred, s, r); + + sse = svdot_u64(sse, abs_diff, abs_diff); + + src_ptr += step; + ref_ptr += step; + w += step; + } while (w < width); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); 
+ + return svaddv_u64(svptrue_b64(), sse); +} + +int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height); + case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width, + height); + } +}
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c index bdbbf70..686fa5f 100644 --- a/aom_dsp/arm/highbd_subpel_variance_neon.c +++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -184,40 +184,40 @@ \ if (xoffset == 0) { \ if (yoffset == 0) { \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ src_stride, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -225,21 +225,21 @@ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, 
src_stride, 1, h, \ xoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -508,22 +508,22 @@ } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ - uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + 1)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ - xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t 
*second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -538,19 +538,19 @@ if (yoffset == 0) { \ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp, source_stride, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp, source_stride, source_stride, h, yoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -559,7 +559,7 @@ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ @@ -567,7 +567,7 @@ (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, 
CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ @@ -575,7 +575,7 @@ (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -584,7 +584,7 @@ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -592,7 +592,7 @@ (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -600,7 +600,7 @@ (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -714,25 +714,25 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) #endif // !CONFIG_REALTIME_ONLY -#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ - unsigned int \ - aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, 
const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t tmp0[w * (h + 1)]; \ - uint16_t tmp1[w * (h + 1)]; \ - uint16_t tmp2[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ - xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ - h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ - msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ +#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * (h + 1)]; \ + uint16_t tmp2[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ + h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -749,7 +749,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \ w, h, src, src_stride, msk, msk_stride, \ invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ 
} else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -758,7 +758,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -767,7 +767,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -778,7 +778,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -789,7 +789,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -800,7 +800,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } else { \ @@ -812,7 +812,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, 
CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ @@ -824,7 +824,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp0[w * (h + 1)]; \ @@ -836,7 +836,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } \
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c index e54fc18..18b8eff 100644 --- a/aom_dsp/arm/highbd_variance_neon.c +++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -412,52 +412,34 @@ return *sse; } -static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, - sse); -} - -static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, - sse); -} - -#define HIGHBD_MSE_WXH_NEON(w, h) \ - uint32_t aom_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ - return *sse; \ - } \ - \ - uint32_t aom_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ - return *sse; \ - } \ - \ - uint32_t aom_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ - return *sse; \ +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t aom_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + 
highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ } HIGHBD_MSE_WXH_NEON(16, 16)
diff --git a/aom_dsp/arm/highbd_variance_sve.c b/aom_dsp/arm/highbd_variance_sve.c new file mode 100644 index 0000000..ad1f55e --- /dev/null +++ b/aom_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,421 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/variance.h" + +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s64 = aom_sdotq_s16(sse_s64, diff, diff); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + h -= 2; + } while (h != 0); + + *sum = vaddlvq_s16(sum_s16); + *sse = vaddvq_s64(sse_s64); +} + +static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref, + int32x4_t *sum, int64x2_t *sse) { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + *sum = vpadalq_s16(*sum, diff); + + *sse = aom_sdotq_s16(*sse, diff, diff); +} + +static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr, + int 
src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = vaddlvq_s32(sum_s32); + *sse = vaddvq_s64(sse_s64); +} + +static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]); + variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1])); + *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); +} + +static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int j = 0; + do { + variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]); + variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1], + &sse_s64[1]); + variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2], + &sse_s64[2]); + variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3], + &sse_s64[3]); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], 
sum_s32[2])); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2])); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +static INLINE void highbd_variance_128xh_sve(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_8_SVE(w, h) \ + uint32_t aom_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_SVE(w, h) \ + uint32_t aom_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = 
(uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_SVE(w, h) \ + uint32_t aom_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 4) +HBD_VARIANCE_WXH_8_SVE(4, 8) + +HBD_VARIANCE_WXH_8_SVE(8, 4) +HBD_VARIANCE_WXH_8_SVE(8, 8) +HBD_VARIANCE_WXH_8_SVE(8, 16) + +HBD_VARIANCE_WXH_8_SVE(16, 8) +HBD_VARIANCE_WXH_8_SVE(16, 16) +HBD_VARIANCE_WXH_8_SVE(16, 32) + +HBD_VARIANCE_WXH_8_SVE(32, 16) +HBD_VARIANCE_WXH_8_SVE(32, 32) +HBD_VARIANCE_WXH_8_SVE(32, 64) + +HBD_VARIANCE_WXH_8_SVE(64, 32) +HBD_VARIANCE_WXH_8_SVE(64, 64) +HBD_VARIANCE_WXH_8_SVE(64, 128) + +HBD_VARIANCE_WXH_8_SVE(128, 64) +HBD_VARIANCE_WXH_8_SVE(128, 128) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 4) +HBD_VARIANCE_WXH_10_SVE(4, 8) + +HBD_VARIANCE_WXH_10_SVE(8, 4) +HBD_VARIANCE_WXH_10_SVE(8, 8) +HBD_VARIANCE_WXH_10_SVE(8, 16) + +HBD_VARIANCE_WXH_10_SVE(16, 8) +HBD_VARIANCE_WXH_10_SVE(16, 16) +HBD_VARIANCE_WXH_10_SVE(16, 32) + +HBD_VARIANCE_WXH_10_SVE(32, 16) +HBD_VARIANCE_WXH_10_SVE(32, 32) +HBD_VARIANCE_WXH_10_SVE(32, 64) + +HBD_VARIANCE_WXH_10_SVE(64, 32) +HBD_VARIANCE_WXH_10_SVE(64, 64) +HBD_VARIANCE_WXH_10_SVE(64, 128) + +HBD_VARIANCE_WXH_10_SVE(128, 64) +HBD_VARIANCE_WXH_10_SVE(128, 128) + +// 12-bit 
+HBD_VARIANCE_WXH_12_SVE(4, 4) +HBD_VARIANCE_WXH_12_SVE(4, 8) + +HBD_VARIANCE_WXH_12_SVE(8, 4) +HBD_VARIANCE_WXH_12_SVE(8, 8) +HBD_VARIANCE_WXH_12_SVE(8, 16) + +HBD_VARIANCE_WXH_12_SVE(16, 8) +HBD_VARIANCE_WXH_12_SVE(16, 16) +HBD_VARIANCE_WXH_12_SVE(16, 32) + +HBD_VARIANCE_WXH_12_SVE(32, 16) +HBD_VARIANCE_WXH_12_SVE(32, 32) +HBD_VARIANCE_WXH_12_SVE(32, 64) + +HBD_VARIANCE_WXH_12_SVE(64, 32) +HBD_VARIANCE_WXH_12_SVE(64, 64) +HBD_VARIANCE_WXH_12_SVE(64, 128) + +HBD_VARIANCE_WXH_12_SVE(128, 64) +HBD_VARIANCE_WXH_12_SVE(128, 128) + +#if !CONFIG_REALTIME_ONLY +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 16) + +HBD_VARIANCE_WXH_8_SVE(8, 32) + +HBD_VARIANCE_WXH_8_SVE(16, 4) +HBD_VARIANCE_WXH_8_SVE(16, 64) + +HBD_VARIANCE_WXH_8_SVE(32, 8) + +HBD_VARIANCE_WXH_8_SVE(64, 16) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 16) + +HBD_VARIANCE_WXH_10_SVE(8, 32) + +HBD_VARIANCE_WXH_10_SVE(16, 4) +HBD_VARIANCE_WXH_10_SVE(16, 64) + +HBD_VARIANCE_WXH_10_SVE(32, 8) + +HBD_VARIANCE_WXH_10_SVE(64, 16) + +// 12-bit +HBD_VARIANCE_WXH_12_SVE(4, 16) + +HBD_VARIANCE_WXH_12_SVE(8, 32) + +HBD_VARIANCE_WXH_12_SVE(16, 4) +HBD_VARIANCE_WXH_12_SVE(16, 64) + +HBD_VARIANCE_WXH_12_SVE(32, 8) + +HBD_VARIANCE_WXH_12_SVE(64, 16) + +#endif // !CONFIG_REALTIME_ONLY + +#undef HBD_VARIANCE_WXH_8_SVE +#undef HBD_VARIANCE_WXH_10_SVE +#undef HBD_VARIANCE_WXH_12_SVE + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint64x2_t sse_u64 = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u64 = aom_udotq_u16(sse_u64, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sse = (uint32_t)vaddvq_u64(sse_u64); + return *sse; +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t aom_highbd_10_mse##w##x##h##_sve( \ + const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4)); + + uint64x2_t sum = vdupq_n_u64(0); + + if (w == 8) { + do { + uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); + uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); + uint16x8_t s0 = vld1q_u16(src + 0 * sstride); + uint16x8_t s1 = vld1q_u16(src + 1 * sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 2 * dstride; + src += 2 * sstride; + h -= 2; + } while (h != 0); + } else { // w == 4 + do { + uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); + uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride); + uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); + uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum 
= aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 4 * dstride; + src += 4 * sstride; + h -= 4; + } while (h != 0); + } + + return vaddvq_u64(sum); +}
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c index 41f070e..f024c4f 100644 --- a/aom_dsp/arm/intrapred_neon.c +++ b/aom_dsp/arm/intrapred_neon.c
@@ -11,13 +11,17 @@ #include <arm_neon.h> #include <assert.h> +#include <stdint.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/reinterpret_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" //------------------------------------------------------------------------------ @@ -33,7 +37,7 @@ static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t dc) { for (int i = 0; i < h; ++i) { - store_u8_4x1(dst + i * stride, dc, 0); + store_u8_4x1(dst + i * stride, dc); } } @@ -578,7 +582,7 @@ static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t d0) { for (int i = 0; i < h; ++i) { - store_u8_4x1(dst + i * stride, d0, 0); + store_u8_4x1(dst + i * stride, d0); } } @@ -754,14 +758,14 @@ // ----------------------------------------------------------------------------- static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { - store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0); - store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0); - store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0); - store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0); - store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4), 0); - store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5), 0); - store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6), 0); - store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7), 0); + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); + store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4)); + store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5)); + store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6)); + store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7)); 
} static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { @@ -858,10 +862,10 @@ const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = load_u8_4x1(left); (void)above; - store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0); - store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0); - store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0); - store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0); + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); } void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -1045,17 +1049,6 @@ /* ---------------------P R E D I C T I O N Z 1--------------------------- */ -static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { - { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, - { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, - { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, - { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, - { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, - { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, - { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, - { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } -}; - // Low bit depth functions static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1163,9 +1156,7 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]); - const uint8x8_t v_32 = vdup_n_u8(32); int x = dx; for (int r = 0; r < W; r++) { @@ -1191,7 +1182,7 @@ shift = vdupq_n_u16((x & 0x3f) >> 1); } uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]); - uint16x8_t a32 = vmlal_u8(a16, a01_128.val[0], v_32); + uint16x8_t a32 = 
vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32)); uint16x8_t res = vmlaq_u16(a32, diff, shift); uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]); @@ -1240,17 +1231,10 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint8x8_t v_32 = vdup_n_u8(32); - const uint8x16_t v_zero = vdupq_n_u8(0); int x = dx; for (int r = 0; r < W; r++) { - uint16x8x2_t res; - uint16x8_t shift; - uint8x16_t a0_128, a1_128; - int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { @@ -1262,25 +1246,28 @@ if (base_max_diff > H) base_max_diff = H; + uint16x8_t shift; + uint8x16_t a0_128, a1_128; if (upsample_above) { uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base); a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]); - a1_128 = vextq_u8(a0_128, v_zero, 8); - shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1); + a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8); + shift = vdupq_n_u16(x & 0x1f); } else { a0_128 = vld1q_u8(above + base); a1_128 = vld1q_u8(above + base + 1); shift = vdupq_n_u16((x & 0x3f) >> 1); } - uint16x8x2_t diff, a32; - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = 
vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); uint8x16_t v_temp = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]); dst[r] = vbslq_u8(mask, v_temp, a_mbase_x); @@ -1301,10 +1288,7 @@ } static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon( - int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above, - int dx) { - // here upsample_above is 0 by design of av1_use_intra_edge_upsample - (void)upsample_above; + int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); @@ -1316,13 +1300,9 @@ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint16x8_t a16 = vdupq_n_u16(16); - const uint8x8_t v_32 = vdup_n_u8(32); int x = dx; for (int r = 0; r < N; r++) { - uint8x16_t res16[2]; - int base = x >> frac_bits; int base_max_diff = (max_base_x - base); if (base_max_diff <= 0) { @@ -1336,54 +1316,84 @@ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); + uint8x16_t res16[2]; for (int j = 0, jj = 0; j < 32; j += 16, jj++) { int mdiff = base_max_diff - j; if (mdiff <= 0) { res16[jj] = a_mbase_x; } else { - uint16x8x2_t a32, diff, res; - uint8x16_t a0_128, a1_128; - a0_128 = vld1q_u8(above + base + j); - a1_128 = vld1q_u8(above + base + j + 1); - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); + uint8x16_t a0_128 = vld1q_u8(above + base + j); + uint8x16_t a1_128 = vld1q_u8(above + base + j + 1); + uint16x8_t diff_lo = 
vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); - res16[jj] = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); } } - uint8x16x2_t mask; - - mask.val[0] = vld1q_u8(BaseMask[base_max_diff]); - mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16); - dstvec[r].val[0] = vbslq_u8(mask.val[0], res16[0], a_mbase_x); - dstvec[r].val[1] = vbslq_u8(mask.val[1], res16[1], a_mbase_x); + uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]); + uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16); + dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x); + dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x); x += dx; } } static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int upsample_above, - int dx) { + const uint8_t *above, int dx) { uint8x16x2_t dstvec[64]; - dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx); + dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx); for (int i = 0; i < N; i++) { vst1q_u8(dst + stride * i, dstvec[i].val[0]); vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]); } } +// clang-format off +static const uint8_t kLoadMaxShuffles[] = { + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, + 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +// clang-format on + +static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr, + int shuffle_idx) { + uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); + uint8x16_t src = vld1q_u8(ptr); +#if AOM_ARCH_AARCH64 + return vqtbl1q_u8(src, shuffle); +#else + uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; + uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); + uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); + return vcombine_u8(lo, hi); +#endif +} + static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int upsample_above, - int dx) { - // here upsample_above is 0 by design of av1_use_intra_edge_upsample - (void)upsample_above; + const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); @@ -1394,12 +1404,7 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x); - const uint8x8_t v_32 = vdup_n_u8(32); - const uint8x16_t v_zero = vdupq_n_u8(0); - const uint8x16_t step = vdupq_n_u8(16); int x = dx; for (int r = 0; r < N; r++, dst += stride) { @@ -1421,28 +1426,37 @@ vcreate_u8(0x0F0E0D0C0B0A0908))); for (int j = 0; 
j < 64; j += 16) { - int mdif = max_base_x - (base + j); - if (mdif <= 0) { + if (base + j >= max_base_x) { vst1q_u8(dst + j, a_mbase_x); } else { - uint16x8x2_t a32, diff, res; - uint8x16_t a0_128, a1_128, mask128, res128; - a0_128 = vld1q_u8(above + base + j); - a1_128 = vld1q_u8(above + base + 1 + j); - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); - uint8x16_t v_temp = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + uint8x16_t a0_128; + uint8x16_t a1_128; + if (base + j + 15 >= max_base_x) { + int shuffle_idx = max_base_x - base - j; + a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); + } else { + a0_128 = vld1q_u8(above + base + j); + } + if (base + j + 16 >= max_base_x) { + int shuffle_idx = max_base_x - base - j - 1; + a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); + } else { + a1_128 = vld1q_u8(above + base + j + 1); + } - mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero); - res128 = vbslq_u8(mask128, v_temp, a_mbase_x); - vst1q_u8(dst + j, res128); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); + vst1q_u8(dst + j, + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5))); - base_inc128 = vaddq_u8(base_inc128, step); + base_inc128 = vaddq_u8(base_inc128, 
vdupq_n_u8(16)); } } x += dx; @@ -1466,18 +1480,15 @@ case 16: dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx); break; - case 32: - dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx); - break; - case 64: - dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx); - break; + case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break; + case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break; default: break; } } /* ---------------------P R E D I C T I O N Z 2--------------------------- */ +#if !AOM_ARCH_AARCH64 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = { { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -1486,17 +1497,322 @@ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }; +#endif // !AOM_ARCH_AARCH64 -static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero, - int shift_value) { - switch (shift_value) { - case 1: *vec = vext_u8(*v_zero, *vec, 7); break; - case 2: *vec = vext_u8(*v_zero, *vec, 6); break; - case 3: *vec = vext_u8(*v_zero, *vec, 5); break; - default: break; +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon( + const uint8_t *above, int upsample_above, int dx, int base_x, int y, + uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) { + uint16x4_t r6 = vcreate_u16(0x00C0008000400000); + uint16x4_t ydx = vdup_n_u16(y * dx); + if (upsample_above) { + // Cannot use LD2 here since we only want to load eight bytes, but LD2 can + // only load either 16 or 32. 
+ uint8x8_t v_tmp = vld1_u8(above + base_x); + *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0]; + *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1]; + *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f)); + } else { + *a0_x = load_u8_4x1(above + base_x); + *a1_x = load_u8_4x1(above + base_x + 1); + *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f)); } } +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x2_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y, + uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) { + int16x4_t dy64 = vdup_n_s16(dy); + int16x4_t v_1234 = vcreate_s16(0x0004000300020001); + int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); + int16x4_t min_base_y64 = vdup_n_s16(min_base_y); + int16x4_t v_r6 = vdup_n_s16(r << 6); + int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); + int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); + + // Values in base_y_c64 range from -2 through 14 inclusive. 
+ base_y_c64 = vmax_s16(base_y_c64, min_base_y64); + +#if AOM_ARCH_AARCH64 + uint8x8_t left_idx0 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] + uint8x8_t left_idx1 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] + + *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0)); + *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1)); +#else // !AOM_ARCH_AARCH64 + DECLARE_ALIGNED(32, int16_t, base_y_c[4]); + + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a0_y_u8 = vdup_n_u8(0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2); + a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4); + a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6); + + base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a1_y_u8 = vdup_n_u8(0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2); + a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4); + a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6); + + *a0_y = vreinterpret_u16_u8(a0_y_u8); + *a1_y = vreinterpret_u16_u8(a1_y_u8); +#endif // AOM_ARCH_AARCH64 + + if (upsample_left) { + *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f)); + } else { + *shift1 = + vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f)); + } +} + +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon( + const uint8_t *above, int upsample_above, int dx, int base_x, int y) { + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + uint16x8_t ydx = vdupq_n_u16(y * dx); + uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6); + + uint16x8_t shift0; + uint8x8_t a0_x0; + uint8x8_t a1_x0; + if (upsample_above) { + uint8x8x2_t v_tmp = vld2_u8(above + base_x); + a0_x0 = v_tmp.val[0]; + a1_x0 = v_tmp.val[1]; + shift0 = 
vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } else { + a0_x0 = vld1_u8(above + base_x); + a1_x0 = vld1_u8(above + base_x + 1); + shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } + + uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff0, shift0); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x3_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) { + int16x8_t v_r6 = vdupq_n_s16(r << 6); + int16x8_t dy128 = vdupq_n_s16(dy); + int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); + int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); + + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); + int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); + + // Values in base_y_c128 range from -2 through 31 inclusive. 
+ base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128); + +#if AOM_ARCH_AARCH64 + uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); + uint8x8_t a0_x1 = vget_low_u8(a01_x); + uint8x8_t a1_x1 = vget_high_u8(a01_x); +#else // !AOM_ARCH_AARCH64 + uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128); + uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128); +#endif // AOM_ARCH_AARCH64 + + uint16x8_t shift1; + if (upsample_left) { + shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f)); + } else { + shift1 = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1); + } + + uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1); + uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32)); + uint16x8_t res = vmlaq_u16(a32, diff1, shift1); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon( + const uint8_t *above, int dx, int base_x, int y, int j) { + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8_t j256 = vdupq_n_u16(j); + uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); + + const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j); + const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1); + uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); + uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); + uint16x8_t shift0 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1); + uint16x8_t shift1 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1); + // a[x+1] - a[x] + uint16x8_t diff0 = 
vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128)); + uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128)); + // a[x] * 32 + 16 + uint16x8_t a32_0 = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t a32_1 = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0); + uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1); + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x4_t left_vals0, uint8x16x4_t left_vals1, +#else + const uint8_t *left, +#endif + int dy, int r, int j) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_y = -1; + + int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); + int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1); + int16x8_t dy256 = vdupq_n_s16(dy); + uint16x8_t j256 = vdupq_n_u16(j); + + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)), + vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } }; + + int16x8_t v_r6 = vdupq_n_s16(r << 6); + + int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0])); + int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1])); + int16x8_t mul16_lo = vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t mul16_hi = vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo); + int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi); + + int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6); 
+ int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6); + + base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo); + base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi); + +#if !AOM_ARCH_AARCH64 + int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7); + int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0); + int16_t offset_diff = max_y - min_y; + + uint8x8_t a0_y0; + uint8x8_t a0_y1; + uint8x8_t a1_y0; + uint8x8_t a1_y1; + if (offset_diff < 16) { + // Avoid gathers where the data we want is close together in memory. + // We don't need this for AArch64 since we can already use TBL to cover the + // full range of possible values. + assert(offset_diff >= 0); + int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3); + + int16x8x2_t base_y_offset; + base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256); + base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256); + + int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]), + vqmovn_s16(base_y_offset.val[1])); + + uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); + uint8x16_t a0_y128 = vld1q_u8(left + min_y); + uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1); + a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); + a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); + + uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8_t v_index_high = + vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8x2_t v_tmp, v_res; + v_tmp.val[0] = vget_low_u8(a0_y128); + v_tmp.val[1] = vget_high_u8(a0_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + v_tmp.val[0] = vget_low_u8(a1_y128); + v_tmp.val[1] = vget_high_u8(a1_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + + a0_y0 = vget_low_u8(a0_y128); + a0_y1 = vget_high_u8(a0_y128); + a1_y0 = 
vget_low_u8(a1_y128); + a1_y1 = vget_high_u8(a1_y128); + } else { + a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo); + a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi); + a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo); + a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi); + } +#else + // Values in left_idx{0,1} range from 0 through 63 inclusive. + uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1))); + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1))); + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); + uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); + + uint8x8_t a0_y0 = vget_low_u8(a0_y01); + uint8x8_t a0_y1 = vget_high_u8(a0_y01); + uint8x8_t a1_y0 = vget_low_u8(a1_y01); + uint8x8_t a1_y1 = vget_high_u8(a1_y01); +#endif // !AOM_ARCH_AARCH64 + + uint16x8_t shifty_lo = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1); + uint16x8_t shifty_hi = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1); + + // a[x+1] - a[x] + uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0); + uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1); + // a[x] * 32 + 16 + uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32)); + uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32)); + + uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo); + uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi); + + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, @@ -1513,20 +1829,6 @@ // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - uint16x8_t a0_x, a1_x, a32, diff; - uint16x8_t v_32 = vdupq_n_u16(32); - 
uint16x8_t v_zero = vdupq_n_u16(0); - uint16x8_t a16 = vdupq_n_u16(16); - - uint8x8_t v_zero_u8 = vdup_n_u8(0); - uint16x4_t v_c3f = vdup_n_u16(0x3f); - uint16x4_t r6 = vcreate_u16(0x00C0008000400000); - int16x4_t v_upsample_left = vdup_n_s16(upsample_left); - int16x4_t v_upsample_above = vdup_n_s16(upsample_above); - int16x4_t v_1234 = vcreate_s16(0x0004000300020001); - int16x4_t dy64 = vdup_n_s16(dy); - int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); - int16x4_t min_base_y64 = vdup_n_s16(min_base_y); #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 14 directly to avoid over-read. @@ -1534,140 +1836,76 @@ const uint8x16_t left_0 = vld1q_u8(left); const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14); const uint8x16x2_t left_vals = { { left_m2, left_14 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { - uint16x8_t res, shift; - uint8x8_t resx, resy; - uint16x4x2_t v_shift; - v_shift.val[1] = vdup_n_u16(0); int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; - int base_shift = 0; - if (base_x < (min_base_x - 1)) { - base_shift = (min_base_x - base_x - 1) >> upsample_above; - } - int base_min_diff = - (min_base_x - base_x + upsample_above) >> upsample_above; - if (base_min_diff > 4) { - base_min_diff = 4; + const int base_min_diff = + (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >> + upsample_above; + + if (base_min_diff <= 0) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint8x8_t a0_x = a0_x_u8; + uint8x8_t a1_x = a1_x_u8; + + uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = + vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0))); + uint8x8_t resx = vshrn_n_u16(res, 5); + vst1_lane_u32((uint32_t *)dst, 
vreinterpret_u32_u8(resx), 0); + } else if (base_min_diff < 4) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint16x8_t a0_x = vmovl_u8(a0_x_u8); + uint16x8_t a1_x = vmovl_u8(a1_x_u8); + + uint16x4_t a0_y; + uint16x4_t a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y); + a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y); + + uint16x8_t shift = vcombine_u16(shift0, shift1); + uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff, shift); + uint8x8_t resx = vshrn_n_u16(res, 5); + uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4); + + uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); + uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); } else { - if (base_min_diff < 0) base_min_diff = 0; + uint16x4_t a0_y, a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x] + uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16 + uint16x4_t res = vmla_u16(a32, diff, shift1); + uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0); } - if (base_shift > 3) { - a0_x = v_zero; - a1_x = v_zero; - v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8); - v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8); - } else { - uint16x4_t ydx = vdup_n_u16(y * dx); - - if (upsample_above) { - uint8x8x2_t v_tmp; - v_tmp.val[0] = vld1_u8(above + base_x + base_shift); - v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8); - uint8x8_t v_index_low = 
vld1_u8(EvenOddMaskx[base_shift]); - uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8); - a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low)); - a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high)); - v_shift.val[0] = vshr_n_u16( - vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1); - } else { - uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift); - vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift); - uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1); - v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1); - a0_x = vmovl_u8(v_a0_x64); - a1_x = vmovl_u8(v_a1_x64); - } - } - - // y calc - if (base_x < min_base_x) { - int16x4_t v_r6 = vdup_n_s16(r << 6); - int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); - int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); - uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64); - - // Values in base_y_c64 range from -2 through 14 inclusive. - base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64)); - -#if AOM_ARCH_AARCH64 - uint8x8_t left_idx0 = - vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] - uint8x8_t left_idx1 = - vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] - - uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8); - uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[4]); - - vst1_s16(base_y_c, base_y_c64); - uint8x8_t a0_y = vdup_n_u8(0); - a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0); - a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2); - a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4); - a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6); - - base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); - vst1_s16(base_y_c, base_y_c64); - uint8x8_t a1_y = vdup_n_u8(0); - a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0); - a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2); - a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4); - a1_y = 
vld1_lane_u8(left + base_y_c[3], a1_y, 6); -#endif // AOM_ARCH_AARCH64 - - if (upsample_left) { - v_shift.val[1] = vshr_n_u16( - vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left), - v_c3f), - 1); - } else { - v_shift.val[1] = - vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1); - } - - a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y)); - a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y)); - } - shift = vcombine_u16(v_shift.val[0], v_shift.val[1]); - diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] - a32 = vmlaq_u16(a16, a0_x, v_32); // a[x] * 32 + 16 - res = vmlaq_u16(a32, diff, shift); - resx = vshrn_n_u16(res, 5); - resy = vext_u8(resx, v_zero_u8, 4); - - uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); - uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); - dst += stride; } -} - -static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero, - int shift_value) { - switch (shift_value) { - case 1: *vec = vextq_u8(*vzero, *vec, 15); break; - case 2: *vec = vextq_u8(*vzero, *vec, 14); break; - case 3: *vec = vextq_u8(*vzero, *vec, 13); break; - case 4: *vec = vextq_u8(*vzero, *vec, 12); break; - case 5: *vec = vextq_u8(*vzero, *vec, 11); break; - case 6: *vec = vextq_u8(*vzero, *vec, 10); break; - case 7: *vec = vextq_u8(*vzero, *vec, 9); break; - case 8: *vec = vextq_u8(*vzero, *vec, 8); break; - case 9: *vec = vextq_u8(*vzero, *vec, 7); break; - case 10: *vec = vextq_u8(*vzero, *vec, 6); break; - case 11: *vec = vextq_u8(*vzero, *vec, 5); break; - case 12: *vec = vextq_u8(*vzero, *vec, 4); break; - case 13: *vec = vextq_u8(*vzero, *vec, 3); break; - case 14: *vec = vextq_u8(*vzero, *vec, 2); break; - case 15: *vec = vextq_u8(*vzero, *vec, 1); break; - default: break; - } +#undef LEFT } static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride, @@ -1685,18 +1923,6 @@ // above[x+1] - above[x] // final pixels will 
be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - uint16x8x2_t diff, a32; - uint8x16_t v_zero = vdupq_n_u8(0); - int16x8_t v_upsample_left = vdupq_n_s16(upsample_left); - int16x8_t v_upsample_above = vdupq_n_s16(upsample_above); - int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); - - uint16x8_t a16 = vdupq_n_u16(16); - uint16x8_t c3f = vdupq_n_u16(0x3f); - int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); - int16x8_t dy128 = vdupq_n_s16(dy); - uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), - vcreate_u16(0x0008000700060005)); #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 30 directly to avoid over-read. @@ -1706,170 +1932,46 @@ const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14); const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14); const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { - uint8x8_t resx, resy, resxy; - uint16x8x2_t res, shift; - shift.val[1] = vdupq_n_u16(0); - int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; - int base_shift = 0; - if (base_x < (min_base_x - 1)) { - base_shift = (min_base_x - base_x - 1) >> upsample_above; - } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; - if (base_min_diff > 8) { - base_min_diff = 8; - } else { - if (base_min_diff < 0) base_min_diff = 0; - } - uint8x8_t a0_x0, a1_x0; - if (base_shift > 7) { - a0_x0 = vdup_n_u8(0); - a1_x0 = vdup_n_u8(0); - shift.val[0] = vreinterpretq_u16_u8(v_zero); - shift.val[1] = vreinterpretq_u16_u8(v_zero); - } else { - uint16x8_t ydx = vdupq_n_u16(y * dx); - uint16x8_t r6 = - vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6); - - if (upsample_above) { - uint8x8x2_t v_tmp; - v_tmp.val[0] = vld1_u8(above + base_x + base_shift); - v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8); - uint8x8_t v_index_low = 
vld1_u8(EvenOddMaskx[base_shift]); - uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8); - shift.val[0] = vshrq_n_u16( - vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1); - a0_x0 = vtbl2_u8(v_tmp, v_index_low); - a1_x0 = vtbl2_u8(v_tmp, v_index_high); - } else { - uint8x16_t a0_x128, a1_x128; - a0_x128 = vld1q_u8(above + base_x + base_shift); - a1_x128 = vextq_u8(a0_x128, v_zero, 1); - vector_shuffle(&a0_x128, &v_zero, base_shift); - vector_shuffle(&a1_x128, &v_zero, base_shift); - shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1); - a0_x0 = vget_low_u8(a0_x128); - a1_x0 = vget_low_u8(a1_x128); - } - } - - diff.val[0] = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] - a32.val[0] = vmlal_u8(a16, a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]); - resx = vshrn_n_u16(res.val[0], 5); - - // y calc - if (base_x < min_base_x) { - int16x8_t y_c128, base_y_c128; - uint16x8_t mask128; - int16x8_t v_r6 = vdupq_n_s16(r << 6); - - y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); - base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); - mask128 = vcgtq_s16(min_base_y128, base_y_c128); - - // Values in base_y_c128 range from -2 through 31 inclusive. 
- base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128)); - -#if AOM_ARCH_AARCH64 - uint8x16_t left_idx0 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] - uint8x16_t left_idx1 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] - uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); - - uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); - uint8x8_t a0_x1 = vget_low_u8(a01_x); - uint8x8_t a1_x1 = vget_high_u8(a01_x); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[16]); - - vst1q_s16(base_y_c, base_y_c128); - uint8x8_t a0_x1 = vdup_n_u8(0); - a0_x1 = vld1_lane_u8(left + base_y_c[0], a0_x1, 0); - a0_x1 = vld1_lane_u8(left + base_y_c[1], a0_x1, 1); - a0_x1 = vld1_lane_u8(left + base_y_c[2], a0_x1, 2); - a0_x1 = vld1_lane_u8(left + base_y_c[3], a0_x1, 3); - a0_x1 = vld1_lane_u8(left + base_y_c[4], a0_x1, 4); - a0_x1 = vld1_lane_u8(left + base_y_c[5], a0_x1, 5); - a0_x1 = vld1_lane_u8(left + base_y_c[6], a0_x1, 6); - a0_x1 = vld1_lane_u8(left + base_y_c[7], a0_x1, 7); - - base_y_c128 = vaddq_s16(base_y_c128, vdupq_n_s16(1)); - vst1q_s16(base_y_c, base_y_c128); - uint8x8_t a1_x1 = vdup_n_u8(0); - a1_x1 = vld1_lane_u8(left + base_y_c[0], a1_x1, 0); - a1_x1 = vld1_lane_u8(left + base_y_c[1], a1_x1, 1); - a1_x1 = vld1_lane_u8(left + base_y_c[2], a1_x1, 2); - a1_x1 = vld1_lane_u8(left + base_y_c[3], a1_x1, 3); - a1_x1 = vld1_lane_u8(left + base_y_c[4], a1_x1, 4); - a1_x1 = vld1_lane_u8(left + base_y_c[5], a1_x1, 5); - a1_x1 = vld1_lane_u8(left + base_y_c[6], a1_x1, 6); - a1_x1 = vld1_lane_u8(left + base_y_c[7], a1_x1, 7); -#endif // AOM_ARCH_AARCH64 - - if (upsample_left) { - shift.val[1] = vshrq_n_u16( - vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left), - c3f), - 1); - } else { - shift.val[1] = - vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1); - } - - diff.val[1] = vsubl_u8(a1_x1, a0_x1); - a32.val[1] = vmlal_u8(a16, a0_x1, vdup_n_u8(32)); - 
res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]); - resy = vshrn_n_u16(res.val[1], 5); + if (base_min_diff <= 0) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + vst1_u8(dst, resx); + } else if (base_min_diff < 8) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); - resxy = vbsl_u8(mask, resy, resx); + uint8x8_t resxy = vbsl_u8(mask, resy, resx); vst1_u8(dst, resxy); } else { - vst1_u8(dst, resx); + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); + vst1_u8(dst, resy); } dst += stride; } +#undef LEFT } static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int upsample_above, - int upsample_left, int dx, int dy) { + const uint8_t *left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; - const int min_base_y = -1; - (void)upsample_above; - (void)upsample_left; - const int frac_bits_x = 6; - const int frac_bits_y = 6; - - uint16x8x2_t a32, c0123, c1234, diff, shifty; - uint8x16x2_t a0_x, a1_x; - uint16x8_t v_32 = vdupq_n_u16(32); - uint8x16_t v_zero = vdupq_n_u8(0); - int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); - - uint16x8_t a16 = vdupq_n_u16(16); - uint16x8_t c1 = vshrq_n_u16(a16, 4); - int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); - uint16x8_t c3f = vdupq_n_u16(0x3f); - int16x8_t dy256 = vdupq_n_s16(dy); - c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000), - vcreate_u16(0x0007000600050004)); - c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008), - vcreate_u16(0x000F000E000D000C)); - c1234.val[0] = vaddq_u16(c0123.val[0], c1); - c1234.val[1] = 
vaddq_u16(c0123.val[1], c1); #if AOM_ARCH_AARCH64 const uint8x16_t left_m1 = vld1q_u8(left - 1); @@ -1882,241 +1984,36 @@ const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15); const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } }; const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } }; +#define LEFT left_vals0, left_vals1 +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < H; r++) { - uint16x8x2_t res, r6, shift; - uint16x8_t j256; - uint8x16_t resx, resy, resxy; int y = r + 1; - uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); - - int base_x = (-y * dx) >> frac_bits_x; + int base_x = (-y * dx) >> 6; for (int j = 0; j < W; j += 16) { - j256 = vdupq_n_u16(j); + const int base_min_diff = min_base_x - base_x - j; - int base_shift = 0; - if ((base_x + j) < (min_base_x - 1)) { - base_shift = (min_base_x - (base_x + j) - 1); - } - int base_min_diff = (min_base_x - base_x - j); - if (base_min_diff > 16) { - base_min_diff = 16; + if (base_min_diff <= 0) { + uint8x16_t resx = + dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + vst1q_u8(dst + j, resx); + } else if (base_min_diff < 16) { + uint8x16_t resx = + dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); + uint8x16_t resxy = vbslq_u8(mask, resy, resx); + vst1q_u8(dst + j, resxy); } else { - if (base_min_diff < 0) base_min_diff = 0; + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + vst1q_u8(dst + j, resy); } - - if (base_shift < 16) { - uint8x16_t a0_x128, a1_x128; - a0_x128 = vld1q_u8(above + base_x + base_shift + j); - a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j); - vector_shuffle(&a0_x128, &v_zero, base_shift); - vector_shuffle(&a1_x128, &v_zero, base_shift); - a0_x = vzipq_u8(a0_x128, v_zero); - a1_x = vzipq_u8(a1_x128, v_zero); - r6.val[0] = 
vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); - r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); - shift.val[0] = - vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1); - shift.val[1] = - vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1); - diff.val[0] = - vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]), - vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x] - diff.val[1] = - vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]), - vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x] - a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]), - v_32); // a[x] * 32 + 16 - a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]), - v_32); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]); - resx = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); - } else { - resx = v_zero; - } - - // y calc - if (base_x < min_base_x) { - uint16x8x2_t mask256; - int16x8x2_t c256, y_c256, base_y_c256, mul16; - int16x8_t v_r6 = vdupq_n_s16(r << 6); - - c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256), - vreinterpretq_s16_u16(c1234.val[0])); - c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256), - vreinterpretq_s16_u16(c1234.val[1])); - mul16.val[0] = vreinterpretq_s16_u16( - vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[0], dy256)), - vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1))); - mul16.val[1] = vreinterpretq_s16_u16( - vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[1], dy256)), - vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1))); - y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]); - y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]); - - base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y); - base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y); - mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]); - mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]); - - base_y_c256.val[0] = - 
vbslq_s16(mask256.val[0], min_base_y256, base_y_c256.val[0]); - base_y_c256.val[1] = - vbslq_s16(mask256.val[1], min_base_y256, base_y_c256.val[1]); - - int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7); - int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0); - int16_t offset_diff = max_y - min_y; - - uint8x8_t a0_y0; - uint8x8_t a0_y1; - uint8x8_t a1_y0; - uint8x8_t a1_y1; - - if (offset_diff < 16) { - assert(offset_diff >= 0); - int16x8_t min_y256 = - vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3); - - int16x8x2_t base_y_offset; - base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256); - base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256); - - int8x16_t base_y_offset128 = - vcombine_s8(vqmovn_s16(base_y_offset.val[0]), - vqmovn_s16(base_y_offset.val[1])); - - uint8x16_t a0_y128, a1_y128; - uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); - a0_y128 = vld1q_u8(left + min_y); - a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); - a1_y128 = vld1q_u8(left + min_y + 1); - a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); -#if AOM_ARCH_AARCH64 - a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128)); - a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128)); -#else - uint8x8x2_t v_tmp; - uint8x8x2_t v_res; - uint8x8_t v_index_low = - vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); - uint8x8_t v_index_high = - vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); - v_tmp.val[0] = vget_low_u8(a0_y128); - v_tmp.val[1] = vget_high_u8(a0_y128); - v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); - v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); - a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); - v_tmp.val[0] = vget_low_u8(a1_y128); - v_tmp.val[1] = vget_high_u8(a1_y128); - v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); - v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); - a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); -#endif - a0_y0 = vget_low_u8(a0_y128); - a0_y1 = vget_high_u8(a0_y128); - a1_y0 = vget_low_u8(a1_y128); 
- a1_y1 = vget_high_u8(a1_y128); - } else { - // Values in base_y_c256 range from -1 through 62 inclusive. - base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0], - vreinterpretq_s16_u16(mask256.val[0])); - base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1], - vreinterpretq_s16_u16(mask256.val[1])); - -#if AOM_ARCH_AARCH64 - // Values in left_idx{0,1} range from 0 through 63 inclusive. - uint8x16_t left_idx0 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1))); - uint8x16_t left_idx1 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1))); - - uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); - - uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); - uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); - - a0_y0 = vget_low_u8(a0_y01); - a0_y1 = vget_high_u8(a0_y01); - a1_y0 = vget_low_u8(a1_y01); - a1_y1 = vget_high_u8(a1_y01); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[16]); - - vst1q_s16(base_y_c, base_y_c256.val[0]); - vst1q_s16(base_y_c + 8, base_y_c256.val[1]); - a0_y0 = vdup_n_u8(0); - a0_y0 = vld1_lane_u8(left + base_y_c[0], a0_y0, 0); - a0_y0 = vld1_lane_u8(left + base_y_c[1], a0_y0, 1); - a0_y0 = vld1_lane_u8(left + base_y_c[2], a0_y0, 2); - a0_y0 = vld1_lane_u8(left + base_y_c[3], a0_y0, 3); - a0_y0 = vld1_lane_u8(left + base_y_c[4], a0_y0, 4); - a0_y0 = vld1_lane_u8(left + base_y_c[5], a0_y0, 5); - a0_y0 = vld1_lane_u8(left + base_y_c[6], a0_y0, 6); - a0_y0 = vld1_lane_u8(left + base_y_c[7], a0_y0, 7); - a0_y1 = vdup_n_u8(0); - a0_y1 = vld1_lane_u8(left + base_y_c[8], a0_y1, 0); - a0_y1 = vld1_lane_u8(left + base_y_c[9], a0_y1, 1); - a0_y1 = vld1_lane_u8(left + base_y_c[10], a0_y1, 2); - a0_y1 = vld1_lane_u8(left + base_y_c[11], a0_y1, 3); - a0_y1 = vld1_lane_u8(left + base_y_c[12], a0_y1, 4); - a0_y1 = vld1_lane_u8(left + base_y_c[13], a0_y1, 5); - a0_y1 = vld1_lane_u8(left + base_y_c[14], a0_y1, 6); - a0_y1 = vld1_lane_u8(left + base_y_c[15], a0_y1, 7); - - base_y_c256.val[0] = - 
vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1)); - base_y_c256.val[1] = - vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1)); - - vst1q_s16(base_y_c, base_y_c256.val[0]); - vst1q_s16(base_y_c + 8, base_y_c256.val[1]); - a1_y0 = vdup_n_u8(0); - a1_y0 = vld1_lane_u8(left + base_y_c[0], a1_y0, 0); - a1_y0 = vld1_lane_u8(left + base_y_c[1], a1_y0, 1); - a1_y0 = vld1_lane_u8(left + base_y_c[2], a1_y0, 2); - a1_y0 = vld1_lane_u8(left + base_y_c[3], a1_y0, 3); - a1_y0 = vld1_lane_u8(left + base_y_c[4], a1_y0, 4); - a1_y0 = vld1_lane_u8(left + base_y_c[5], a1_y0, 5); - a1_y0 = vld1_lane_u8(left + base_y_c[6], a1_y0, 6); - a1_y0 = vld1_lane_u8(left + base_y_c[7], a1_y0, 7); - a1_y1 = vdup_n_u8(0); - a1_y1 = vld1_lane_u8(left + base_y_c[8], a1_y1, 0); - a1_y1 = vld1_lane_u8(left + base_y_c[9], a1_y1, 1); - a1_y1 = vld1_lane_u8(left + base_y_c[10], a1_y1, 2); - a1_y1 = vld1_lane_u8(left + base_y_c[11], a1_y1, 3); - a1_y1 = vld1_lane_u8(left + base_y_c[12], a1_y1, 4); - a1_y1 = vld1_lane_u8(left + base_y_c[13], a1_y1, 5); - a1_y1 = vld1_lane_u8(left + base_y_c[14], a1_y1, 6); - a1_y1 = vld1_lane_u8(left + base_y_c[15], a1_y1, 7); -#endif // AOM_ARCH_AARCH64 - } - - shifty.val[0] = vshrq_n_u16( - vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1); - shifty.val[1] = vshrq_n_u16( - vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1); - diff.val[0] = vsubl_u8(a1_y0, a0_y0); // a[x+1] - a[x] - diff.val[1] = vsubl_u8(a1_y1, a0_y1); // a[x+1] - a[x] - a32.val[0] = vmlal_u8(a16, a0_y0, vdup_n_u8(32)); // a[x] * 32 + 16 - a32.val[1] = vmlal_u8(a16, a0_y1, vdup_n_u8(32)); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]); - - resy = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); - } else { - resy = v_zero; - } - uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); - resxy = vbslq_u8(mask, resy, resx); - vst1q_u8(dst + j, resxy); } // for j 
dst += stride; } +#undef LEFT } // Directional prediction, zone 2: 90 < angle < 180 @@ -2137,626 +2034,67 @@ upsample_left, dx, dy); break; default: - dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, - upsample_above, upsample_left, dx, dy); + dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy); break; } } /* ---------------------P R E D I C T I O N Z 3--------------------------- */ -static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x, - uint16x8x2_t *d) { - uint8x16x2_t w0, w1; +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x, + uint8x16x2_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - - d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); + d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0]))); + d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1]))); } -static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x, - uint16x4x2_t *d) { - uint8x8x2_t w0, w1; +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - - *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); + *d = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); } -static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x, - uint16x4x2_t *d) { - uint8x8x2_t w0, w1; +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], 
x[3]); - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - - d[0] = - vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - d[1] = - vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); + d[0] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); + d[1] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]))); } -static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x, - uint32x2x2_t *d) { - uint8x8x2_t w0, w1, w2, w3; - uint16x4x2_t w4, w5; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - - d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); -} - -static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) { - uint8x8x2_t w0, w1, w2, w3; - uint16x4x2_t w4, w5, w6, w7; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - - d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); - - w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); - w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); - - d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]), - vreinterpret_u32_u16(w7.val[0])); - d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]), - vreinterpret_u32_u16(w7.val[1])); 
-} - -static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x, - uint64x2_t *d) { - uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11; - uint16x4x2_t w4, w5, w12, w13; - uint32x2x2_t w6, w7, w14, w15; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w8 = vzip_u8(x[8], x[9]); - w9 = vzip_u8(x[10], x[11]); - w10 = vzip_u8(x[12], x[13]); - w11 = vzip_u8(x[14], x[15]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - w12 = - vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); - w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), - vreinterpret_u16_u8(w11.val[0])); - - w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); - w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), - vreinterpret_u32_u16(w13.val[0])); - w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), - vreinterpret_u32_u16(w13.val[1])); - - // Store first 4-line result - d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]), - vreinterpret_u64_u32(w14.val[0])); - d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]), - vreinterpret_u64_u32(w14.val[1])); - d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]), - vreinterpret_u64_u32(w15.val[0])); - d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]), - vreinterpret_u64_u32(w15.val[1])); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); - w12 = - vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); - w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), - vreinterpret_u16_u8(w11.val[1])); - - w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - 
vreinterpret_u32_u16(w5.val[1])); - w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), - vreinterpret_u32_u16(w13.val[0])); - w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), - vreinterpret_u32_u16(w13.val[1])); - - // Store second 4-line result - d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]), - vreinterpret_u64_u32(w14.val[0])); - d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]), - vreinterpret_u64_u32(w14.val[1])); - d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]), - vreinterpret_u64_u32(w15.val[0])); - d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]), - vreinterpret_u64_u32(w15.val[1])); -} - -static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x, - uint64x2_t *d) { - uint8x16x2_t w0, w1, w2, w3; - uint16x8x2_t w4, w5, w6, w7; - uint32x4x2_t w8, w9, w10, w11; - - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - w2 = vzipq_u8(x[4], x[5]); - w3 = vzipq_u8(x[6], x[7]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - - w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), - vreinterpretq_u32_u16(w7.val[0])); - w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), - vreinterpretq_u32_u16(w7.val[1])); - -#if AOM_ARCH_AARCH64 - d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]), - vreinterpretq_u64_u32(w9.val[0])); - d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]), - vreinterpretq_u64_u32(w9.val[0])); - d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]), - vreinterpretq_u64_u32(w9.val[1])); - d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]), - 
vreinterpretq_u64_u32(w9.val[1])); - d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]), - vreinterpretq_u64_u32(w11.val[0])); - d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]), - vreinterpretq_u64_u32(w11.val[0])); - d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]), - vreinterpretq_u64_u32(w11.val[1])); - d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]), - vreinterpretq_u64_u32(w11.val[1])); -#else - d[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0]))); - d[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0]))); - d[2] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1]))); - d[3] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1]))); - d[4] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0]))); - d[5] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0]))); - d[6] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1]))); - d[7] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1]))); -#endif -} - -static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) { - uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7; - uint16x8x2_t w8, w9, w10, w11; - uint32x4x2_t w12, w13, w14, w15; - - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - w2 = vzipq_u8(x[4], x[5]); - w3 = vzipq_u8(x[6], x[7]); - - w4 = vzipq_u8(x[8], x[9]); - w5 = vzipq_u8(x[10], x[11]); - w6 = vzipq_u8(x[12], x[13]); - w7 = vzipq_u8(x[14], x[15]); - - w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), - vreinterpretq_u16_u8(w5.val[0])); - w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), - 
vreinterpretq_u16_u8(w7.val[0])); - - w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), - vreinterpretq_u32_u16(w9.val[0])); - w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), - vreinterpretq_u32_u16(w11.val[0])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), - vreinterpretq_u32_u16(w9.val[1])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), - vreinterpretq_u32_u16(w11.val[1])); - -#if AOM_ARCH_AARCH64 - d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0]))); - d[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0]))); - d[2] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1]))); - d[3] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1]))); - d[4] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0]))); - d[5] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0]))); - d[6] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1]))); - d[7] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1]))); -#endif - - 
// upper half - w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), - vreinterpretq_u16_u8(w5.val[1])); - w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), - vreinterpretq_u16_u8(w7.val[1])); - - w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), - vreinterpretq_u32_u16(w9.val[0])); - w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), - vreinterpretq_u32_u16(w11.val[0])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), - vreinterpretq_u32_u16(w9.val[1])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), - vreinterpretq_u32_u16(w11.val[1])); - -#if AOM_ARCH_AARCH64 - d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[8] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0]))); - d[9] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0]))); - d[10] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1]))); - d[11] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1]))); - d[12] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[0]), 
vget_low_u32(w15.val[0]))); - d[13] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0]))); - d[14] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1]))); - d[15] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1]))); -#endif -} - -static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x, - uint64x2x2_t *d) { - uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11; - uint16x8x2_t w4, w5, w12, w13; - uint32x4x2_t w6, w7, w14, w15; - - w0 = vzipq_u8(x[0].val[0], x[1].val[0]); - w1 = vzipq_u8(x[2].val[0], x[3].val[0]); - w2 = vzipq_u8(x[4].val[0], x[5].val[0]); - w3 = vzipq_u8(x[6].val[0], x[7].val[0]); - - w8 = vzipq_u8(x[8].val[0], x[9].val[0]); - w9 = vzipq_u8(x[10].val[0], x[11].val[0]); - w10 = vzipq_u8(x[12].val[0], x[13].val[0]); - w11 = vzipq_u8(x[14].val[0], x[15].val[0]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]), - vreinterpretq_u16_u8(w9.val[0])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]), - vreinterpretq_u16_u8(w11.val[0])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store first 4-line result - -#if AOM_ARCH_AARCH64 - d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[1].val[1] = 
vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[0].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[0].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[1].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[1].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[2].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[2].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[3].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[3].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]), - vreinterpretq_u16_u8(w9.val[1])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]), - vreinterpretq_u16_u8(w11.val[1])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = 
vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store second 4-line result - -#if AOM_ARCH_AARCH64 - d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[4].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[4].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[5].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[5].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[6].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[6].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[7].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[7].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif - - // upper half - w0 = vzipq_u8(x[0].val[1], x[1].val[1]); - w1 = vzipq_u8(x[2].val[1], x[3].val[1]); - w2 = vzipq_u8(x[4].val[1], x[5].val[1]); - w3 = vzipq_u8(x[6].val[1], 
x[7].val[1]); - - w8 = vzipq_u8(x[8].val[1], x[9].val[1]); - w9 = vzipq_u8(x[10].val[1], x[11].val[1]); - w10 = vzipq_u8(x[12].val[1], x[13].val[1]); - w11 = vzipq_u8(x[14].val[1], x[15].val[1]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]), - vreinterpretq_u16_u8(w9.val[0])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]), - vreinterpretq_u16_u8(w11.val[0])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store first 4-line result - -#if AOM_ARCH_AARCH64 - d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[8].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[8].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[9].val[0] = 
vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[9].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[10].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[10].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[11].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[11].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]), - vreinterpretq_u16_u8(w9.val[1])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]), - vreinterpretq_u16_u8(w11.val[1])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store second 4-line result - -#if AOM_ARCH_AARCH64 - d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - 
vreinterpretq_u64_u32(w15.val[0])); - d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[12].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[12].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[13].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[13].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[14].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[14].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[15].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[15].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif -} - -static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, - uint8_t *dst, ptrdiff_t pitchDst) { +static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + // The same as the normal transposes in transpose_neon.h, but with a stride + // between consecutive vectors of elements. 
uint8x16_t r[16]; - uint64x2_t d[16]; + uint8x16_t d[16]; for (int i = 0; i < 16; i++) { r[i] = vld1q_u8(src + i * pitchSrc); } - transpose16x16_neon(r, d); + transpose_arrays_u8_16x16(r, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * pitchDst, d[i]); } } -static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, - ptrdiff_t pitchDst, int width, int height) { +static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src, + ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, + int height) { for (int j = 0; j < height; j += 16) { for (int i = 0; i < width; i += 16) { - transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, - dst + j * pitchDst + i, pitchDst); + z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); } } } @@ -2765,89 +2103,60 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; - uint16x4x2_t dest; + uint8x8x2_t dest; dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy); - transpose4x8_8x4_low_neon(dstvec, &dest); - vst1_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpret_u32_u16(dest.val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 1), - vreinterpret_u32_u16(dest.val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpret_u32_u16(dest.val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpret_u32_u16(dest.val[1]), 1); + z3_transpose_arrays_u8_4x4(dstvec, &dest); + store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]); } static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; - uint32x2x2_t d[4]; + uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy); - transpose8x8_neon(dstvec, d); - vst1_u32((uint32_t *)(dst + 0 * stride), 
d[0].val[0]); - vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]); - vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]); - vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]); - vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]); - vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]); - vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]); - vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); } static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; - uint16x4x2_t d[2]; + uint8x8x2_t d[2]; dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy); - transpose4x8_8x4_neon(dstvec, d); - vst1_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpret_u32_u16(d[0].val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 1), - vreinterpret_u32_u16(d[0].val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpret_u32_u16(d[0].val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpret_u32_u16(d[0].val[1]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 4), - vreinterpret_u32_u16(d[1].val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 5), - vreinterpret_u32_u16(d[1].val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 6), - vreinterpret_u32_u16(d[1].val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 7), - vreinterpret_u32_u16(d[1].val[1]), 1); + z3_transpose_arrays_u8_8x4(dstvec, d); + store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]); + store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]); + store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]); } static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; - uint32x2x2_t d[2]; 
+ uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy); - transpose8x8_low_neon(dstvec, d); - vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]); - vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]); - vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]); - vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]); } static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[8]; - uint64x2_t d[8]; + uint8x8_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy); - transpose8x16_16x8_neon(dstvec, d); - for (int i = 0; i < 8; i++) { - vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i]))); - vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i]))); + transpose_arrays_u8_16x8(dstvec, d); + for (int i = 0; i < 16; i++) { + vst1_u8(dst + i * stride, d[i]); } } @@ -2855,12 +2164,12 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; - uint64x2_t d[8]; + uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); + transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 8; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } @@ -2868,78 +2177,45 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[4]; - uint16x8x2_t d[2]; + uint8x16x2_t d[2]; dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy); - transpose4x16_neon(dstvec, d); - vst1q_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpretq_u32_u16(d[0].val[0]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 1), - vreinterpretq_u32_u16(d[0].val[0]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpretq_u32_u16(d[0].val[0]), 2); - 
vst1q_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpretq_u32_u16(d[0].val[0]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 4), - vreinterpretq_u32_u16(d[0].val[1]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 5), - vreinterpretq_u32_u16(d[0].val[1]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 6), - vreinterpretq_u32_u16(d[0].val[1]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 7), - vreinterpretq_u32_u16(d[0].val[1]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 8), - vreinterpretq_u32_u16(d[1].val[0]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 9), - vreinterpretq_u32_u16(d[1].val[0]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 10), - vreinterpretq_u32_u16(d[1].val[0]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 11), - vreinterpretq_u32_u16(d[1].val[0]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 12), - vreinterpretq_u32_u16(d[1].val[1]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 13), - vreinterpretq_u32_u16(d[1].val[1]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 14), - vreinterpretq_u32_u16(d[1].val[1]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 15), - vreinterpretq_u32_u16(d[1].val[1]), 3); + z3_transpose_arrays_u8_16x4(dstvec, d); + store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]); + store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]); + store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]); } static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; - uint64x2_t d[8]; + uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); + transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 4; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } static void 
dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8x16x2_t dstvec[16]; - uint64x2x2_t d[16]; + uint8x16_t d[32]; uint8x16_t v_zero = vdupq_n_u8(0); - dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy); + dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy); for (int i = 8; i < 16; i++) { dstvec[i].val[0] = v_zero; dstvec[i].val[1] = v_zero; } - transpose16x32_neon(dstvec, d); - for (int i = 0; i < 16; i++) { - vst1_u8(dst + 2 * i * stride, - vreinterpret_u8_u64(vget_low_u64(d[i].val[0]))); - vst1_u8(dst + (2 * i + 1) * stride, - vreinterpret_u8_u64(vget_low_u64(d[i].val[1]))); + transpose_arrays_u8_32x16(dstvec, d); + for (int i = 0; i < 32; i++) { + vst1_u8(dst + i * stride, vget_low_u8(d[i])); } } @@ -2947,14 +2223,14 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[32]; - uint64x2_t d[16]; + uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); - transpose16x8_8x16_neon(dstvec + 16, d + 8); + transpose_arrays_u8_8x16(dstvec, d); + transpose_arrays_u8_8x16(dstvec + 16, d + 8); for (int i = 0; i < 8; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); - vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8])); + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 8]); } } @@ -2962,53 +2238,53 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[16]; - uint64x2_t d[16]; + uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy); - transpose16x16_neon(dstvec, d); + transpose_arrays_u8_16x16(dstvec, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + 
(void)upsample_left; uint8x16x2_t dstvec[32]; - uint64x2x2_t d[32]; + uint8x16_t d[64]; - dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy); - transpose16x32_neon(dstvec, d); - transpose16x32_neon(dstvec + 16, d + 16); - for (int i = 0; i < 16; i++) { - vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0])); - vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0])); - vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1])); - vst1q_u8(dst + (2 * i + 1) * stride + 16, - vreinterpretq_u8_u64(d[i + 16].val[1])); + dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); + transpose_arrays_u8_32x16(dstvec + 16, d + 32); + for (int i = 0; i < 32; i++) { + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 32]); } } static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); - dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 64, 64); + dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64); } static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8x16x2_t dstvec[16]; - uint64x2x2_t d[16]; + uint8x16_t d[32]; - dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy); - transpose16x32_neon(dstvec, d); + dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0])); - vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1])); + vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]); + vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]); } } @@ -3016,13 
+2292,13 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[32]; - uint64x2_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy); for (int i = 0; i < 32; i += 16) { - transpose16x16_neon((dstvec + i), d); + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); for (int j = 0; j < 16; j++) { - vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j])); + vst1q_u8(dst + j * stride + i, d[j]); } } } @@ -3030,45 +2306,68 @@ static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8_t dstT[64 * 32]; - dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 32, 64); + dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64); } static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8_t dstT[32 * 64]; - dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy); - transpose(dstT, 32, dst, stride, 64, 32); + dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32); } static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8_t dstT[64 * 16]; - dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 16, 64); + dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64); } static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[64]; - uint64x2_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy); for (int i = 0; i < 64; i += 16) { - transpose16x16_neon((dstvec + 
i), d); - for (int j = 0; j < 16; j++) { - vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j])); + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); + for (int j = 0; j < 16; ++j) { + vst1q_u8(dst + j * stride + i, d[j]); } } } +typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy); + +static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = { + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, + dr_prediction_z3_4x16_neon, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon, + dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL }, + { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon, + dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon, + dr_prediction_z3_16x64_neon }, + { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon, + dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon }, + { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon, + dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon }, +}; + void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { @@ -3077,85 +2376,9 @@ assert(dx == 1); assert(dy > 0); - if (bw == bh) { - switch (bw) { - case 4: - dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy); - break; - case 64: - dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - if (bw < bh) { - if (bw + bw == bh) { - switch (bw) { - case 4: - 
dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - switch (bw) { - case 4: - dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } - } else { - if (bh + bh == bw) { - switch (bh) { - case 4: - dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - switch (bh) { - case 4: - dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy); - break; - } - } - } - } + dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)]; + assert(f != NULL); + f(dst, stride, left, upsample_left, dy); } // ----------------------------------------------------------------------------- @@ -3759,7 +2982,7 @@ result = vbsl_u8(left_or_top_mask, result, top_left); if (width == 4) { - store_unaligned_u8_4x1(dest, result, 0); + store_u8_4x1(dest, result); } else { // width == 8 vst1_u8(dest, result); }
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c index 0e683a7..7c64be1 100644 --- a/aom_dsp/arm/loopfilter_neon.c +++ b/aom_dsp/arm/loopfilter_neon.c
@@ -692,7 +692,7 @@ row2 = vcombine_u8(p5p1, q2q6); row3 = vcombine_u8(p4p0, q3qy); - store_u8_8x16(src - 8, stride, row0, row1, row2, row3); + store_u8_16x4(src - 8, stride, row0, row1, row2, row3); } void aom_lpf_vertical_14_dual_neon( @@ -862,10 +862,8 @@ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); - store_unaligned_u8_4x1(src - 2, p1p0, 0); - store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0); - store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1); - store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1); + store_u8x4_strided_x2(src - 2, 2 * stride, p1p0); + store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1); } void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, @@ -897,18 +895,12 @@ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - store_u8_4x1(src - 6 * stride, p5q5, 0); - store_u8_4x1(src - 5 * stride, p4q4, 0); - store_u8_4x1(src - 4 * stride, p3q3, 0); - store_u8_4x1(src - 3 * stride, p2q2, 0); - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 1); - store_u8_4x1(src + 2 * stride, p2q2, 1); - store_u8_4x1(src + 3 * stride, p3q3, 1); - store_u8_4x1(src + 4 * stride, p4q4, 1); - store_u8_4x1(src + 5 * stride, p5q5, 1); + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); + store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2); + store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3); + store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4); + store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5); } void aom_lpf_horizontal_14_dual_neon( @@ -1029,10 +1021,8 @@ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 
1); + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); } void aom_lpf_horizontal_4_dual_neon(
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index b86397f..fa571a6 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h
@@ -174,6 +174,16 @@ *s3 = vld1_u8(s); } +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3) { @@ -221,6 +231,16 @@ *s1 = vld1q_u16(s); } +static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); +} + static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3) { @@ -459,17 +479,15 @@ *s3 = vld1_s16(s); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define store_u8_2x1(s, s0, lane) \ - do { \ - vst1_lane_u16((uint16_t *)(s), vreinterpret_u16_u8(s0), lane); \ - } while (0) - -#define store_u8_4x1(s, s0, lane) \ - do { \ - vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \ - } while (0) +static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, @@ -505,7 +523,7 @@ vst1_u8(s, s3); } -static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, +static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const uint8x16_t s3) { vst1q_u8(s, s0); @@ -539,6 +557,16 @@ vst1q_u16(s, s7); } +static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t 
s0, const uint16x4_t s1, + const uint16x4_t s2) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); +} + static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3) { @@ -558,6 +586,16 @@ vst1q_u16(s, s1); } +static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); +} + static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2, const uint16x8_t s3) { @@ -604,20 +642,27 @@ vst1_s16(s, s3); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define store_s16_2x1(s, s0, lane) \ - do { \ - vst1_lane_s32((int32_t *)(s), vreinterpret_s32_s16(s0), lane); \ - } while (0) -#define store_u16_2x1(s, s0, lane) \ - do { \ - vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u16(s0), lane); \ - } while (0) -#define store_u16q_2x1(s, s0, lane) \ - do { \ - vst1q_lane_u32((uint32_t *)(s), vreinterpretq_u32_u16(s0), lane); \ - } while (0) +static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); + s += dst_stride; + vst1_s16(s, s4); + s += dst_stride; + vst1_s16(s, s5); + s += dst_stride; + vst1_s16(s, s6); + s += dst_stride; + vst1_s16(s, s7); +} static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1, @@ -631,6 +676,13 @@ vst1q_s16(s, s3); } +static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t 
dst_stride, + const int16x8_t s0, const int16x8_t s1) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); +} + static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, @@ -886,6 +938,16 @@ *s3 = vld1q_s16(s); } +static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; @@ -991,36 +1053,6 @@ load_unaligned_u8_4x4(buf, stride, tu2, tu3); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define store_unaligned_u8_4x1(dst, src, lane) \ - do { \ - uint32_t a; \ - a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ - memcpy(dst, &a, 4); \ - } while (0) - -#define store_unaligned_u8_2x1(dst, src, lane) \ - do { \ - uint16_t a; \ - a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ - memcpy(dst, &a, 2); \ - } while (0) - -#define store_unaligned_u16_2x1(dst, src, lane) \ - do { \ - uint32_t a; \ - a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ - memcpy(dst, &a, 4); \ - } while (0) - -#define store_unaligned_u16_4x1(dst, src, lane) \ - do { \ - uint64_t a; \ - a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ - memcpy(dst, &a, 8); \ - } while (0) - static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3, @@ -1043,6 +1075,21 @@ *s7 = vld1q_u8(s); } +static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += 
p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { @@ -1055,6 +1102,16 @@ *s3 = vld1q_u8(s); } +static INLINE void load_u8_16x3(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, @@ -1202,32 +1259,126 @@ vst1q_s32(buf, v0); } -static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride, - uint8x8_t src) { - store_unaligned_u8_2x1(dst, src, 0); - dst += dst_stride; - store_unaligned_u8_2x1(dst, src, 1); +static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, + int16x8_t indices) { + // Recent Clang and GCC versions correctly identify that this zero-broadcast + // is redundant. Alternatively we could load and broadcast the zeroth element + // and then replace the other lanes, however this is slower than loading a + // single element without broadcast on some micro-architectures. 
+ uint8x8_t ret = vdup_n_u8(0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7); + return ret; } -static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride, - uint8x8_t src) { - store_unaligned_u8_4x1(dst, src, 0); - dst += dst_stride; - store_unaligned_u8_4x1(dst, src, 1); +// The `lane` parameter here must be an immediate. +#define store_u8_2x1_lane(dst, src, lane) \ + do { \ + uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ + memcpy(dst, &a, 2); \ + } while (0) + +#define store_u8_4x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_2x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_4x1_lane(dst, src, lane) \ + do { \ + uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ + memcpy(dst, &a, 8); \ + } while (0) + +#define store_s16_4x1_lane(dst, src, lane) \ + do { \ + int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \ + memcpy(dst, &a, 8); \ + } while (0) + +// Store the low 16-bits from a single vector. 
+static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); } -static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride, - uint16x4_t src) { - store_unaligned_u16_2x1(dst, src, 0); - dst += dst_stride; - store_unaligned_u16_2x1(dst, src, 1); +// Store the low 32-bits from a single vector. +static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); } -static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride, - uint16x8_t src) { - store_unaligned_u16_4x1(dst, src, 0); +// Store two blocks of 16-bits from a single vector. +static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, + uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); dst += dst_stride; - store_unaligned_u16_4x1(dst, src, 1); + store_u8_2x1_lane(dst, src, 1); } +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, + uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); + dst += stride; + store_u8_4x1_lane(dst, src, 1); +} + +// Store four blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, + uint8x16_t src) { + store_u8_4x1_lane(dst, vget_low_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_low_u8(src), 1); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 1); +} + +// Store the low 32-bits from a single vector. +static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); +} + +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, + uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); + dst += dst_stride; + store_u16_2x1_lane(dst, src, 1); +} + +// Store two blocks of 64-bits from a single vector. 
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, + uint16x8_t src) { + store_u16_4x1_lane(dst, src, 0); + dst += dst_stride; + store_u16_4x1_lane(dst, src, 1); +} + +// Store two blocks of 64-bits from a single vector. +static INLINE void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride, + int16x8_t src) { + store_s16_4x1_lane(dst, src, 0); + dst += dst_stride; + store_s16_4x1_lane(dst, src, 1); +} + +#undef store_u8_2x1_lane +#undef store_u8_4x1_lane +#undef store_u16_2x1_lane +#undef store_u16_4x1_lane +#undef store_s16_4x1_lane + #endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/reinterpret_neon.h b/aom_dsp/arm/reinterpret_neon.h new file mode 100644 index 0000000..f970251 --- /dev/null +++ b/aom_dsp/arm/reinterpret_neon.h
@@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ +#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ + +#include <arm_neon.h> + +#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. +#include "config/aom_config.h" + +#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \ + static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \ + aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \ + const u##int##from_sz##x##from_count##x##n##_t src) { \ + u##int##to_sz##x##to_count##x##n##_t ret; \ + for (int i = 0; i < (n); ++i) { \ + ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \ + } \ + return ret; \ + } + +REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t +REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t + +#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
diff --git a/aom_dsp/arm/subtract_neon.c b/aom_dsp/arm/subtract_neon.c index a195c40d..01ae835 100644 --- a/aom_dsp/arm/subtract_neon.c +++ b/aom_dsp/arm/subtract_neon.c
@@ -12,6 +12,7 @@ #include <arm_neon.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h"
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h index b5a8b97..30a108e 100644 --- a/aom_dsp/arm/sum_neon.h +++ b/aom_dsp/arm/sum_neon.h
@@ -17,6 +17,16 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" +static INLINE int horizontal_add_u8x8(const uint8x8_t a) { +#if AOM_ARCH_AARCH64 + return vaddlv_u8(a); +#else + uint16x4_t b = vpaddl_u8(a); + uint32x2_t c = vpaddl_u16(b); + return vget_lane_u32(c, 0) + vget_lane_u32(c, 1); +#endif +} + static INLINE int horizontal_add_s16x8(const int16x8_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_s16(a); @@ -186,6 +196,23 @@ #endif } +static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) { +#if AOM_ARCH_AARCH64 + const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]); + const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]); + const int16x8_t b0 = vpaddq_s16(a0, a1); + return vpaddlq_s16(b0); +#else + const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0])); + const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1])); + const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2])); + const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3])); + const int16x4_t b0 = vpadd_s16(a0, a1); + const int16x4_t b1 = vpadd_s16(a2, a3); + return vpaddlq_s16(vcombine_s16(b0, b1)); +#endif +} + static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) { #if AOM_ARCH_AARCH64 return vaddv_u32(a);
diff --git a/aom_dsp/arm/sum_squares_sve.c b/aom_dsp/arm/sum_squares_sve.c new file mode 100644 index 0000000..c7e6dfc --- /dev/null +++ b/aom_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,402 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares = vdupq_n_s64(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sum_squares = aom_sdotq_s16(sum_squares, s, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + return (uint64_t)vaddvq_s64(sum_squares); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src + 0 * stride); + int16x8_t s1 = vld1q_s16(src + 1 * stride); + + sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src, + int stride, int width, + int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int w = width; + do { + int16x8_t s0 = vld1q_s16(src_ptr); + int16x8_t s1 = vld1q_s16(src_ptr + 8); + + 
sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src += stride; + } while (--height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src, + int stride, int width, + int height) { + svint64_t sum_squares = svdup_n_s64(0); + uint64_t step = svcnth(); + + do { + const int16_t *src_ptr = src; + int w = 0; + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svint16_t s0 = svld1_s16(pred, src_ptr); + + sum_squares = svdot_s64(sum_squares, s0, s0); + + src_ptr += step; + w += step; + } while (w < width); + + src += stride; + } while (--height != 0); + + return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares); +} + +uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width, + int height) { + if (width == 4) { + return aom_sum_squares_2d_i16_4xh_sve(src, stride, height); + } + if (width == 8) { + return aom_sum_squares_2d_i16_8xh_sve(src, stride, height); + } + if (width % 16 == 0) { + return aom_sum_squares_2d_i16_large_sve(src, stride, width, height); + } + return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height); +} + +uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) { + // This function seems to be called only for values of N >= 64. See + // av1/encoder/compound_type.c. Additionally, because N = width x height for + // width and height between the standard block sizes, N will also be a + // multiple of 64. 
+ if (LIKELY(n % 64 == 0)) { + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + 8); + int16x8_t s2 = vld1q_s16(src + 16); + int16x8_t s3 = vld1q_s16(src + 24); + + sum[0] = aom_sdotq_s16(sum[0], s0, s0); + sum[1] = aom_sdotq_s16(sum[1], s1, s1); + sum[2] = aom_sdotq_s16(sum[2], s2, s2); + sum[3] = aom_sdotq_s16(sum[3], s3, s3); + + src += 32; + n -= 32; + } while (n != 0); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + return vaddvq_s64(sum[0]); + } + return aom_sum_squares_i16_c(src, n); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse = vdupq_n_s64(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sse = aom_sdotq_s16(sse, s, s); + + sum_s32 = vpadalq_s16(sum_s32, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(sum_s32); + return vaddvq_s64(sse); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + stride); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src, + int stride, int width, + int height, int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + 
int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int w = 0; + do { + int16x8_t s0 = vld1q_s16(src + w); + int16x8_t s1 = vld1q_s16(src + w + 8); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + w += 16; + } while (w < width); + + src += stride; + } while (--height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width, + int height, int *sum) { + uint64_t sse; + + if (width == 4) { + sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum); + } else if (width == 8) { + sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum); + } else if (width % 16 == 0) { + sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum); + } else { + sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); + } + + return sse; +} + +static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + uint16x8_t s0 = + vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride)); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_u16 += 2 * src_stride; + h -= 2; + } while (h != 0); + + sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + int w = width; + uint16_t 
*src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_ptr += 8; + w -= 8; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + uint16x8_t s2 = vld1q_u16(src_ptr + 16); + uint16x8_t s3 = vld1q_u16(src_ptr + 24); + + 
sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + sum_u32[2] = vpadalq_u16(sum_u32[2], s2); + sum_u32[3] = vpadalq_u16(sum_u32[3], s3); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2); + sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3); + + src_ptr += 32; + w -= 32; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]); + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width, + int height) { + if (width == 4) { + return aom_var_2d_u16_4xh_sve(src, src_stride, width, height); + } + if (width == 8) { + return aom_var_2d_u16_8xh_sve(src, src_stride, width, height); + } + if (width == 16) { + return aom_var_2d_u16_16xh_sve(src, src_stride, width, height); + } + if (width % 32 == 0) { + return aom_var_2d_u16_large_sve(src, src_stride, width, height); + } + return aom_var_2d_u16_neon(src, src_stride, width, height); +}
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h index b215f6a..9fc4fb0 100644 --- a/aom_dsp/arm/transpose_neon.h +++ b/aom_dsp/arm/transpose_neon.h
@@ -16,11 +16,11 @@ #include "aom/aom_integer.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" -static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, - uint8x8_t *a2, uint8x8_t *a3, - uint8x8_t *a4, uint8x8_t *a5, - uint8x8_t *a6, - uint8x8_t *a7) { +static INLINE void transpose_elems_u8_8x8( + uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, + uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, + uint8x8_t *o7) { // Swap 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 @@ -36,10 +36,8 @@ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 - const uint8x16x2_t b0 = - vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); - const uint8x16x2_t b1 = - vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5)); + const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7)); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 @@ -62,14 +60,235 @@ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); - *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); - *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); - *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); - *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); - *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); - *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); - *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); - *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); + *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *o3 = 
vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, + uint8x8_t *a2, uint8x8_t *a3, + uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, + uint8x8_t *a7) { + transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3, + a4, a5, a6, a7); +} + +static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in, + uint8x8_t *out) { + transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + &out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x, + uint8x16_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); + uint8x8x2_t w2 = vzip_u8(x[4], x[5]); + uint8x8x2_t w3 = vzip_u8(x[6], x[7]); + + uint8x8x2_t w8 = vzip_u8(x[8], x[9]); + uint8x8x2_t w9 = vzip_u8(x[10], x[11]); + uint8x8x2_t w10 = vzip_u8(x[12], x[13]); + uint8x8x2_t w11 = vzip_u8(x[14], x[15]); + + uint16x4x2_t w4 = + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); + uint16x4x2_t w5 = + vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); + uint16x4x2_t w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); + uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), + vreinterpret_u16_u8(w11.val[0])); + + uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + 
vreinterpret_u32_u16(w13.val[1])); + + // Store first 4-line result + d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); + + w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); + w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); + w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); + w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), + vreinterpret_u16_u8(w11.val[1])); + + w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + vreinterpret_u32_u16(w13.val[1])); + + // Store second 4-line result + d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x, + uint8x8_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + + 
uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), + vreinterpretq_u32_u16(w5.val[0])); + uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), + vreinterpretq_u32_u16(w7.val[0])); + uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), + vreinterpretq_u32_u16(w5.val[1])); + uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), + vreinterpretq_u32_u16(w7.val[1])); + + d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0])); + d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0])); + d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1])); + d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1])); + d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0])); + d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0])); + d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1])); + d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1])); + d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0])); + d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0])); + d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1])); + d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1])); + d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0])); + d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0])); + d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1])); + d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1])); +} + +static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { + uint16x8x2_t b0; +#if AOM_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else + b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), + vreinterpret_u16_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), + vreinterpret_u16_u32(vget_high_u32(a1))); +#endif + return b0; +} + +static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x, + uint8x16_t *d) { + 
uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint8x16x2_t w4 = vzipq_u8(x[8], x[9]); + uint8x16x2_t w5 = vzipq_u8(x[10], x[11]); + uint8x16x2_t w6 = vzipq_u8(x[12], x[13]); + uint8x16x2_t w7 = vzipq_u8(x[14], x[15]); + + uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), + vreinterpretq_u16_u8(w5.val[0])); + uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), + vreinterpretq_u16_u8(w7.val[0])); + + uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[0] = vreinterpretq_u8_u16(d01.val[0]); + d[1] = vreinterpretq_u8_u16(d01.val[1]); + uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[2] = vreinterpretq_u8_u16(d23.val[0]); + d[3] = vreinterpretq_u8_u16(d23.val[1]); + uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[4] = vreinterpretq_u8_u16(d45.val[0]); + d[5] = vreinterpretq_u8_u16(d45.val[1]); + uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[6] = vreinterpretq_u8_u16(d67.val[0]); + d[7] = vreinterpretq_u8_u16(d67.val[1]); + + // upper half + w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + w10 = 
vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), + vreinterpretq_u16_u8(w5.val[1])); + w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), + vreinterpretq_u16_u8(w7.val[1])); + + w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[8] = vreinterpretq_u8_u16(d01.val[0]); + d[9] = vreinterpretq_u8_u16(d01.val[1]); + d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[10] = vreinterpretq_u8_u16(d23.val[0]); + d[11] = vreinterpretq_u8_u16(d23.val[1]); + d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[12] = vreinterpretq_u8_u16(d45.val[0]); + d[13] = vreinterpretq_u8_u16(d45.val[1]); + d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[14] = vreinterpretq_u8_u16(d67.val[0]); + d[15] = vreinterpretq_u8_u16(d67.val[1]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x, + uint8x16_t *d) { + uint8x16_t x2[32]; + for (int i = 0; i < 16; ++i) { + x2[i] = x[i].val[0]; + x2[i + 16] = x[i].val[1]; + } + transpose_arrays_u8_16x16(x2, d); + transpose_arrays_u8_16x16(x2 + 16, d + 16); } static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, @@ -106,6 +325,41 @@ *a3 = vreinterpret_u8_u16(c1.val[1]); } +static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, + uint8x16_t *a1, + uint8x16_t *a2, + uint8x16_t *a3) { + // Swap 8 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015 + // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115 + // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215 + // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114 + // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115 + // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314 + // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315 + + const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1); + const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312 + // c0.val[1]: 02 12 22 32 06 16 26 36 010 110 210 310 014 114 214 314 + // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 013 113 213 313 + // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + *a0 = vreinterpretq_u8_u16(c0.val[0]); + *a1 = vreinterpretq_u8_u16(c1.val[0]); + *a2 = vreinterpretq_u8_u16(c0.val[1]); + *a3 = vreinterpretq_u8_u16(c1.val[1]); +} + static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { // Swap 16 bit elements. 
Goes from: @@ -265,22 +519,6 @@ a[3] = vreinterpretq_u16_u32(c1.val[1]); } -static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { - uint16x8x2_t b0; -#if AOM_ARCH_AARCH64 - b0.val[0] = vreinterpretq_u16_u64( - vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); - b0.val[1] = vreinterpretq_u16_u64( - vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); -#else - b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), - vreinterpret_u16_u32(vget_low_u32(a1))); - b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), - vreinterpret_u16_u32(vget_high_u32(a1))); -#endif - return b0; -} - // Special transpose for loop filter. // 4x8 Input: // p_q: p3 p2 p1 p0 q0 q1 q2 q3 @@ -682,6 +920,40 @@ out[7] = d3.val[1]; } +static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, + int16x8_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + + *a0 = vreinterpretq_s16_s32(c0.val[0]); + *a1 = vreinterpretq_s16_s32(c1.val[0]); + *a2 = vreinterpretq_s16_s32(c0.val[1]); + *a3 = vreinterpretq_s16_s32(c1.val[1]); +} + static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, 
uint16x4_t *a1, uint16x4_t *a2,
diff --git a/aom_dsp/bitwriter.c b/aom_dsp/bitwriter.c index 23d28a1..4c27bb1 100644 --- a/aom_dsp/bitwriter.c +++ b/aom_dsp/bitwriter.c
@@ -23,6 +23,10 @@ uint32_t bytes; unsigned char *data; data = od_ec_enc_done(&w->ec, &bytes); + if (!data) { + od_ec_enc_clear(&w->ec); + return -1; + } nb_bits = od_ec_enc_tell(&w->ec); memcpy(w->buffer, data, bytes); w->pos = bytes;
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h index fb33909..6aedd8c 100644 --- a/aom_dsp/bitwriter.h +++ b/aom_dsp/bitwriter.h
@@ -62,6 +62,8 @@ void aom_start_encode(aom_writer *w, uint8_t *buffer); +// Returns a negative number on error. Caller must check the return value and +// handle error. int aom_stop_encode(aom_writer *w); int aom_tell_size(aom_writer *w);
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c index dfc1624..591e0ad 100644 --- a/aom_dsp/entenc.c +++ b/aom_dsp/entenc.c
@@ -58,6 +58,7 @@ int d; int c; int s; + if (enc->error) return; c = enc->cnt; assert(rng <= 65535U); /*The number of leading zeros in the 16-bit binary representation of rng.*/ @@ -83,7 +84,6 @@ out = (unsigned char *)realloc(out, sizeof(*out) * storage); if (out == NULL) { enc->error = -1; - enc->offs = 0; return; } enc->buf = out; @@ -372,28 +372,3 @@ uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); } - -/*Saves a entropy coder checkpoint to dst. - This allows an encoder to reverse a series of entropy coder - decisions if it decides that the information would have been - better coded some other way.*/ -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { - OD_COPY(dst, src, 1); -} - -/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. - This can only be used to restore from checkpoints earlier in the target - state's history: you can not switch backwards and forwards or otherwise - switch to a state which isn't a casual ancestor of the current state. - Restore is also incompatible with patching the initial bits, as the - changes will remain in the restored version.*/ -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { - unsigned char *buf; - uint32_t storage; - assert(dst->storage >= src->storage); - buf = dst->buf; - storage = dst->storage; - OD_COPY(dst, src, 1); - dst->buf = buf; - dst->storage = storage; -}
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h index d26f027..1a38aff 100644 --- a/aom_dsp/entenc.h +++ b/aom_dsp/entenc.h
@@ -74,9 +74,6 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) OD_ARG_NONNULL(1); -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); - // buf is the frame bitbuffer, offs is where carry to be added static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { uint16_t sum, carry = 1;
diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c index f091366..5758d28 100644 --- a/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -16,36 +16,10 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { - // Check that the fractional position is in range. - // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int *kernel) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between @@ -157,82 +131,6 @@ } } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 0, -1}. 
- const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); - } - - // Vertical filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution, one of - // which can be reused for the next line. - int16x8_t s0 = vld1q_s16(tmp); - int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); - int16x8_t sum01 = vaddq_s16(s0, s1); - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); - - int16x8_t sum12 = vaddq_s16(s1, s2); - int16x8_t sum = vaddq_s16(sum01, sum12); - - vst1q_s16(dst + i * dst_stride, sum); - - sum01 = sum12; - s1 = s2; - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution. 
- const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - uint16x8_t sum01 = vaddl_u8(s0, s1); - uint16x8_t sum12 = vaddl_u8(s1, s2); - uint16x8_t sum = vaddq_u16(sum01, sum12); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); - } - - // Vertical filter, using kernel {1, 0, -1}. - // Load the whole block at once to avoid redundant loads during convolution. - int16x8_t t[10]; - load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], - &t[5], &t[6], &t[7], &t[8], &t[9]); - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vsubq_s16(t[i], t[i + 2]); - - vst1q_s16(dst + i * dst_stride, diff); - } -} - // Computes the components of the system of equations used to solve for // a flow vector. //
diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.h b/aom_dsp/flow_estimation/arm/disflow_neon.h new file mode 100644 index 0000000..d991a13 --- /dev/null +++ b/aom_dsp/flow_estimation/arm/disflow_neon.h
@@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <math.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. 
+ assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 0, -1}. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); + } + + // Vertical filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution, one of + // which can be reused for the next line. 
+ int16x8_t s0 = vld1q_s16(tmp); + int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); + int16x8_t sum01 = vaddq_s16(s0, s1); + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); + + int16x8_t sum12 = vaddq_s16(s1, s2); + int16x8_t sum = vaddq_s16(sum01, sum12); + + vst1q_s16(dst + i * dst_stride, sum); + + sum01 = sum12; + s1 = s2; + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + uint16x8_t sum01 = vaddl_u8(s0, s1); + uint16x8_t sum12 = vaddl_u8(s1, s2); + uint16x8_t sum = vaddq_u16(sum01, sum12); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); + } + + // Vertical filter, using kernel {1, 0, -1}. + // Load the whole block at once to avoid redundant loads during convolution. + int16x8_t t[10]; + load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], + &t[5], &t[6], &t[7], &t[8], &t[9]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vsubq_s16(t[i], t[i + 2]); + + vst1q_s16(dst + i * dst_stride, diff); + } +} + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_
diff --git a/aom_dsp/flow_estimation/arm/disflow_sve.c b/aom_dsp/flow_estimation/arm/disflow_sve.c new file mode 100644 index 0000000..7b01e90 --- /dev/null +++ b/aom_dsp/flow_estimation/arm/disflow_sve.c
@@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <arm_sve.h> +#include <math.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. 
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution. 
+ const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); + const int16x4_t h_kernel_s16 = vmovn_s32(vld1q_s32(h_kernel)); + const int16x8_t h_filter = vcombine_s16(h_kernel_s16, vdup_n_s16(0)); + const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { + svuint16_t r0 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 0); + svuint16_t r1 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 1); + svuint16_t r2 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 2); + svuint16_t r3 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 3); + + int16x8_t s0 = vreinterpretq_s16_u16(svget_neonq_u16(r0)); + int16x8_t s1 = vreinterpretq_s16_u16(svget_neonq_u16(r1)); + int16x8_t s2 = vreinterpretq_s16_u16(svget_neonq_u16(r2)); + int16x8_t s3 = vreinterpretq_s16_u16(svget_neonq_u16(r3)); + + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s0, h_filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s1, h_filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s2, h_filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s3, h_filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + int16x8_t res = vcombine_s16(vrshrn_n_s32(res0, DISFLOW_INTERP_BITS - 6), + vrshrn_n_s32(res1, DISFLOW_INTERP_BITS - 6)); + + res = aom_tbl_s16(res, idx); + + vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, res); + } + + // Vertical convolution. 
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); + int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); + int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); + int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); + int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); + + uint8x8_t s = vld1_u8(src + (i + y) * stride + x); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); + + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + sum_lo = vrshrq_n_s32(sum_lo, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + sum_hi = vrshrq_n_s32(sum_hi, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); + int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); + vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, + vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. 
+// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. +// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. +// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M_inv) { + int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t x = vld1q_s16(dx + i * dx_stride); + int16x8_t y = vld1q_s16(dy + i * dy_stride); + + sum[0] = aom_sdotq_s16(sum[0], x, x); + sum[1] = aom_sdotq_s16(sum[1], x, y); + sum[2] = aom_sdotq_s16(sum[2], y, y); + } + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[1], sum[2]); + int32x4_t res = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). 
+ // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + + double M0 = (double)vgetq_lane_s32(res, 0) + 1; + double M1 = (double)vgetq_lane_s32(res, 1); + double M2 = (double)vgetq_lane_s32(res, 2); + double M3 = (double)vgetq_lane_s32(res, 3) + 1; + + // Invert matrix M. + double det = (M0 * M3) - (M1 * M2); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M3 * det_inv; + M_inv[1] = -M1 * det_inv; + M_inv[2] = -M2 * det_inv; + M_inv[3] = M0 * det_inv; +} + +static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + const int16_t *dt, int dt_stride, + int *b) { + int64x2_t b_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); + int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); + int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); + + b_s64[0] = aom_sdotq_s16(b_s64[0], dx16, dt16); + b_s64[1] = aom_sdotq_s16(b_s64[1], dy16, dt16); + } + + b_s64[0] = vpaddq_s64(b_s64[0], b_s64[1]); + vst1_s32(b, vmovn_s64(b_s64[0])); +} + +void aom_compute_flow_at_point_sve(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + double M_inv[4]; + int b[2]; + int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); + sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); + compute_flow_vector(dx, DISFLOW_PATCH_SIZE, 
dy, DISFLOW_PATCH_SIZE, dt, + DISFLOW_PATCH_SIZE, b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +}
diff --git a/aom_dsp/flow_estimation/corner_detect.c b/aom_dsp/flow_estimation/corner_detect.c index 284d1bd..44d423d 100644 --- a/aom_dsp/flow_estimation/corner_detect.c +++ b/aom_dsp/flow_estimation/corner_detect.c
@@ -20,6 +20,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" #include "av1/common/common.h" #define FAST_BARRIER 18 @@ -39,11 +40,24 @@ return corners; } -static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { - const uint8_t *buf = pyr->layers[0].buffer; - int width = pyr->layers[0].width; - int height = pyr->layers[0].height; - int stride = pyr->layers[0].stride; +static bool compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { + ImagePyramid *pyr = frame->y_pyramid; + const int layers = + aom_compute_pyramid(frame, bit_depth, downsample_level + 1, pyr); + + if (layers < 0) { + return false; + } + + // Clamp downsampling ratio base on max number of layers allowed + // for this frame size + downsample_level = layers - 1; + + const uint8_t *buf = pyr->layers[downsample_level].buffer; + int width = pyr->layers[downsample_level].width; + int height = pyr->layers[downsample_level].height; + int stride = pyr->layers[downsample_level].stride; int *scores = NULL; int num_corners; @@ -53,9 +67,11 @@ if (num_corners <= MAX_CORNERS) { // Use all detected corners - if (num_corners != 0) { - memcpy(corners->corners, frame_corners_xy, - sizeof(*frame_corners_xy) * num_corners); + for (int i = 0; i < num_corners; i++) { + corners->corners[2 * i + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * i + 1] = + frame_corners_xy[i].y * (1 << downsample_level); } corners->num_corners = num_corners; } else { @@ -85,8 +101,10 @@ for (int i = 0; i < num_corners; i++) { if (scores[i] > threshold) { assert(copied_corners < MAX_CORNERS); - corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x; - corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y; + corners->corners[2 * copied_corners + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + 
corners->corners[2 * copied_corners + 1] = + frame_corners_xy[i].y * (1 << downsample_level); copied_corners += 1; } } @@ -99,7 +117,8 @@ return true; } -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { assert(corners); #if CONFIG_MULTITHREAD @@ -107,7 +126,8 @@ #endif // CONFIG_MULTITHREAD if (!corners->valid) { - corners->valid = compute_corner_list(pyr, corners); + corners->valid = + compute_corner_list(frame, bit_depth, downsample_level, corners); } bool valid = corners->valid;
diff --git a/aom_dsp/flow_estimation/corner_detect.h b/aom_dsp/flow_estimation/corner_detect.h index d05846c..54d9430 100644 --- a/aom_dsp/flow_estimation/corner_detect.h +++ b/aom_dsp/flow_estimation/corner_detect.h
@@ -18,7 +18,7 @@ #include <memory.h> #include "aom_dsp/pyramid.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,7 +57,8 @@ CornerList *av1_alloc_corner_list(void); -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners); +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners); #ifndef NDEBUG // Check if a corner list has already been computed.
diff --git a/aom_dsp/flow_estimation/corner_match.c b/aom_dsp/flow_estimation/corner_match.c index cef719b..c78edb8 100644 --- a/aom_dsp/flow_estimation/corner_match.c +++ b/aom_dsp/flow_estimation/corner_match.c
@@ -17,62 +17,84 @@ #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_scale/yv12config.h" -#define SEARCH_SZ 9 -#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) - #define THRESHOLD_NCC 0.75 -/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame, - centered at (x, y). +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. 
*/ -static double compute_variance(const unsigned char *frame, int stride, int x, - int y) { +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, + int y, double *mean, double *one_over_stddev) { int sum = 0; int sumsq = 0; - int var; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } - var = sumsq * MATCH_SZ_SQ - sum * sum; - return (double)var; + } + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; } -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
*/ -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, - int x1, int y1, - const unsigned char *frame2, int stride2, - int x2, int y2) { +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { int v1, v2; - int sum1 = 0; - int sum2 = 0; - int sumsq2 = 0; int cross = 0; - int var2, cov; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; - sum1 += v1; - sum2 += v2; - sumsq2 += v2 * v2; cross += v1 * v2; } - var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + } + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + double covariance = cross - mean1 * mean2; + double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } static int is_eligible_point(int pointx, int pointy, int width, int height) { @@ -87,65 +109,14 @@ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; } -static void improve_correspondence(const unsigned char *src, - const unsigned char *ref, int width, - int height, int src_stride, int ref_stride, - Correspondence *correspondences, - int num_correspondences) { - int i; - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - // For this algorithm, all points have integer coordinates. - // It's a little more efficient to convert them to ints once, - // before the inner loops - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue; - if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height)) - continue; - match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref, - ref_stride, rx0 + x, ry0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - } - correspondences[i].rx += best_x; - correspondences[i].ry += best_y; - } - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) - for (x = -SEARCH_SZ_BY2; x <= 
SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue; - if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height)) - continue; - match_ncc = av1_compute_cross_correlation( - ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - correspondences[i].x += best_x; - correspondences[i].y += best_y; - } -} +typedef struct { + int x; + int y; + double mean; + double one_over_stddev; + int best_match_idx; + double best_match_corr; +} PointInfo; static int determine_correspondence(const unsigned char *src, const int *src_corners, int num_src_corners, @@ -154,56 +125,136 @@ int width, int height, int src_stride, int ref_stride, Correspondence *correspondences) { - // TODO(sarahparker) Improve this to include 2-way match - int i, j; + PointInfo *src_point_info = NULL; + PointInfo *ref_point_info = NULL; int num_correspondences = 0; - for (i = 0; i < num_src_corners; ++i) { - double best_match_ncc = 0.0; - double template_norm; - int best_match_j = -1; - if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width, - height)) + + src_point_info = + (PointInfo *)aom_calloc(num_src_corners, sizeof(*src_point_info)); + if (!src_point_info) { + goto finished; + } + + ref_point_info = + (PointInfo *)aom_calloc(num_ref_corners, sizeof(*ref_point_info)); + if (!ref_point_info) { + goto finished; + } + + // First pass (linear): + // Filter corner lists and compute per-patch means and standard deviations, + // for the src and ref frames independently + int src_point_count = 0; + for (int i = 0; i < num_src_corners; i++) { + int src_x = src_corners[2 * i]; + int src_y = src_corners[2 * i + 1]; + if (!is_eligible_point(src_x, src_y, width, height)) continue; + + PointInfo *point = &src_point_info[src_point_count]; + point->x = src_x; + point->y = src_y; + point->best_match_corr = THRESHOLD_NCC; + if 
(!aom_compute_mean_stddev(src, src_stride, src_x, src_y, &point->mean, + &point->one_over_stddev)) continue; - for (j = 0; j < num_ref_corners; ++j) { - double match_ncc; - if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, - height)) + src_point_count++; + } + if (src_point_count == 0) { + goto finished; + } + + int ref_point_count = 0; + for (int j = 0; j < num_ref_corners; j++) { + int ref_x = ref_corners[2 * j]; + int ref_y = ref_corners[2 * j + 1]; + if (!is_eligible_point(ref_x, ref_y, width, height)) continue; + + PointInfo *point = &ref_point_info[ref_point_count]; + point->x = ref_x; + point->y = ref_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(ref, ref_stride, ref_x, ref_y, &point->mean, + &point->one_over_stddev)) + continue; + ref_point_count++; + } + if (ref_point_count == 0) { + goto finished; + } + + // Second pass (quadratic): + // For each pair of points, compute correlation, and use this to determine + // the best match of each corner, in both directions + for (int i = 0; i < src_point_count; ++i) { + PointInfo *src_point = &src_point_info[i]; + for (int j = 0; j < ref_point_count; ++j) { + PointInfo *ref_point = &ref_point_info[j]; + if (!is_eligible_distance(src_point->x, src_point->y, ref_point->x, + ref_point->y, width, height)) continue; - if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1], - ref_corners[2 * j], ref_corners[2 * j + 1], - width, height)) - continue; - match_ncc = av1_compute_cross_correlation( - src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref, - ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_match_j = j; + + double corr = aom_compute_correlation( + src, src_stride, src_point->x, src_point->y, src_point->mean, + src_point->one_over_stddev, ref, ref_stride, ref_point->x, + ref_point->y, ref_point->mean, ref_point->one_over_stddev); + + if (corr > 
src_point->best_match_corr) { + src_point->best_match_idx = j; + src_point->best_match_corr = corr; + } + if (corr > ref_point->best_match_corr) { + ref_point->best_match_idx = i; + ref_point->best_match_corr = corr; } } - // Note: We want to test if the best correlation is >= THRESHOLD_NCC, - // but need to account for the normalization in - // av1_compute_cross_correlation. - template_norm = compute_variance(src, src_stride, src_corners[2 * i], - src_corners[2 * i + 1]); - if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { - correspondences[num_correspondences].x = src_corners[2 * i]; - correspondences[num_correspondences].y = src_corners[2 * i + 1]; - correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; - correspondences[num_correspondences].ry = - ref_corners[2 * best_match_j + 1]; + } + + // Third pass (linear): + // Scan through source corners, generating a correspondence for each corner + // iff ref_best_match[src_best_match[i]] == i + // Then refine the generated correspondences using optical flow + for (int i = 0; i < src_point_count; i++) { + PointInfo *point = &src_point_info[i]; + + // Skip corners which were not matched, or which didn't find + // a good enough match + if (point->best_match_corr < THRESHOLD_NCC) continue; + + PointInfo *match_point = &ref_point_info[point->best_match_idx]; + if (match_point->best_match_idx == i) { + // Refine match using optical flow and store + const int sx = point->x; + const int sy = point->y; + const int rx = match_point->x; + const int ry = match_point->y; + double u = (double)(rx - sx); + double v = (double)(ry - sy); + + const int patch_tl_x = sx - DISFLOW_PATCH_CENTER; + const int patch_tl_y = sy - DISFLOW_PATCH_CENTER; + + aom_compute_flow_at_point(src, ref, patch_tl_x, patch_tl_y, width, height, + src_stride, &u, &v); + + Correspondence *correspondence = &correspondences[num_correspondences]; + correspondence->x = (double)sx; + correspondence->y = (double)sy; + correspondence->rx = 
(double)sx + u; + correspondence->ry = (double)sy + v; num_correspondences++; } } - improve_correspondence(src, ref, width, height, src_stride, ref_stride, - correspondences, num_correspondences); + +finished: + aom_free(src_point_info); + aom_free(ref_point_info); return num_correspondences; } bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed) { + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { int num_correspondences; Correspondence *correspondences; ImagePyramid *src_pyramid = src->y_pyramid; @@ -212,19 +263,19 @@ CornerList *ref_corners = ref->corners; // Precompute information we will need about each frame - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + if (aom_compute_pyramid(src, bit_depth, 1, src_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (aom_compute_pyramid(ref, bit_depth, 1, ref_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, ref_corners)) { *mem_alloc_failed = true; return false; }
diff --git a/aom_dsp/flow_estimation/corner_match.h b/aom_dsp/flow_estimation/corner_match.h index 4435d2c..77ebee2 100644 --- a/aom_dsp/flow_estimation/corner_match.h +++ b/aom_dsp/flow_estimation/corner_match.h
@@ -25,14 +25,20 @@ extern "C" { #endif -#define MATCH_SZ 13 +#define MATCH_SZ 16 #define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) +// Minimum threshold for the variance of a patch, in order for it to be +// considered useful for matching. +// This is evaluated against the scaled variance MATCH_SZ_SQ * sigma^2, +// so a setting of 1 * MATCH_SZ_SQ corresponds to an unscaled variance of 1 +#define MIN_FEATURE_VARIANCE (1 * MATCH_SZ_SQ) + bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed); + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus }
diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c index ed5559c..f511a6e 100644 --- a/aom_dsp/flow_estimation/disflow.c +++ b/aom_dsp/flow_estimation/disflow.c
@@ -24,24 +24,29 @@ #include "config/aom_dsp_rtcd.h" -// TODO(rachelbarker): -// Implement specialized functions for upscaling flow fields, -// replacing av1_upscale_plane_double_prec(). -// Then we can avoid needing to include code from av1/ -#include "av1/common/resize.h" - // Amount to downsample the flow field by. -// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate +// e.g., DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate // one flow point for each 4x4 pixel region of the frame // Must be a power of 2 #define DOWNSAMPLE_SHIFT 3 #define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT) + +// Filters used when upscaling the flow field from one pyramid level +// to another. See upscale_flow_component for details on kernel selection +#define FLOW_UPSCALE_TAPS 4 + // Number of outermost flow field entries (on each edge) which can't be // computed, because the patch they correspond to extends outside of the // frame // The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is // (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries -#define FLOW_BORDER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) +#define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) + +// Number of extra padding entries on each side of the flow field. +// These samples are added so that we do not need to apply clamping when +// interpolating or upsampling the flow field +#define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2) + // When downsampling the flow field, each flow field entry covers a square // region of pixels in the image pyramid. This value is equal to the position // of the center of that region, as an offset from the top/left edge. @@ -52,10 +57,16 @@ // this gives the correct offset of 0 instead of -1. 
#define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2) -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { +static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { + // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively + { -3 / 128., 29 / 128., 111 / 128., -9 / 128. }, + { -9 / 128., 111 / 128., 29 / 128., -3 / 128. } +}; + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. @@ -69,7 +80,7 @@ kernel[3] = -0.5 * x2 + 0.5 * x3; } -static INLINE void get_cubic_kernel_int(double x, int *kernel) { +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); @@ -80,18 +91,19 @@ } static INLINE double get_cubic_value_dbl(const double *p, - const double *kernel) { + const double kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } -static INLINE int get_cubic_value_int(const int *p, const int *kernel) { +static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } static INLINE double bicubic_interp_one(const double *arr, int stride, - double *h_kernel, double *v_kernel) { + const double h_kernel[4], + const double v_kernel[4]) { double tmp[1 * 4]; // Horizontal convolution @@ -103,7 +115,9 @@ return get_cubic_value_dbl(tmp, v_kernel); } -static int determine_disflow_correspondence(CornerList *corners, +static int determine_disflow_correspondence(const ImagePyramid *src_pyr, + const ImagePyramid *ref_pyr, + CornerList 
*corners, const FlowField *flow, Correspondence *correspondences) { const int width = flow->width; @@ -132,7 +146,15 @@ const double flow_sub_y = (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; - // Make sure that bicubic interpolation won't read outside of the flow field + // Exclude points which would sample from the outer border of the flow + // field, as this would give lower-quality results. + // + // Note: As we never read from the border region at pyramid level 0, we + // can skip filling it in. If the conditions here are removed, or any + // other logic is added which reads from this border region, then + // compute_flow_field() will need to be modified to call + // fill_flow_field_borders() at pyramid level 0 to set up the correct + // border data. if (flow_x < 1 || (flow_x + 2) >= width) continue; if (flow_y < 1 || (flow_y + 2) >= height) continue; @@ -141,10 +163,18 @@ get_cubic_kernel_dbl(flow_sub_x, h_kernel); get_cubic_kernel_dbl(flow_sub_y, v_kernel); - const double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], - stride, h_kernel, v_kernel); - const double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], - stride, h_kernel, v_kernel); + double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + + // Refine the interpolated flow vector one last time + const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER; + const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER; + aom_compute_flow_at_point( + src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x, + patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height, + src_pyr->layers[0].stride, &flow_u, &flow_v); // Use original points (without offsets) when filling in correspondence // array @@ -420,16 +450,16 @@ // Calculate the bounds of the rectangle which was filled in by // compute_flow_field() before calling 
this function. // These indices are inclusive on both ends. - const int left_index = FLOW_BORDER; - const int right_index = (width - FLOW_BORDER - 1); - const int top_index = FLOW_BORDER; - const int bottom_index = (height - FLOW_BORDER - 1); + const int left_index = FLOW_BORDER_INNER; + const int right_index = (width - FLOW_BORDER_INNER - 1); + const int top_index = FLOW_BORDER_INNER; + const int bottom_index = (height - FLOW_BORDER_INNER - 1); // Left area for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double left = row[left_index]; - for (int j = 0; j < left_index; j++) { + for (int j = -FLOW_BORDER_OUTER; j < left_index; j++) { row[j] = left; } } @@ -438,45 +468,178 @@ for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double right = row[right_index]; - for (int j = right_index + 1; j < width; j++) { + for (int j = right_index + 1; j < width + FLOW_BORDER_OUTER; j++) { row[j] = right; } } // Top area - const double *top_row = flow + top_index * stride; - for (int i = 0; i < top_index; i++) { - double *row = flow + i * stride; - memcpy(row, top_row, width * sizeof(*row)); + const double *top_row = flow + top_index * stride - FLOW_BORDER_OUTER; + for (int i = -FLOW_BORDER_OUTER; i < top_index; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, top_row, length * sizeof(*row)); } // Bottom area - const double *bottom_row = flow + bottom_index * stride; - for (int i = bottom_index + 1; i < height; i++) { - double *row = flow + i * stride; - memcpy(row, bottom_row, width * sizeof(*row)); + const double *bottom_row = flow + bottom_index * stride - FLOW_BORDER_OUTER; + for (int i = bottom_index + 1; i < height + FLOW_BORDER_OUTER; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, bottom_row, length * sizeof(*row)); + } +} + +// Upscale 
one component of the flow field, from a size of +// cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing +// the result back into the same buffer. This function also scales the flow +// vector by 2, so that when we move to the next pyramid level down, the implied +// motion vector is the same. +// +// The temporary buffer tmpbuf must be large enough to hold an intermediate +// array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and +// below. In other words, indices from -FLOW_BORDER_OUTER * stride to +// (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid. +// +// Note that the same stride is used for u before and after upscaling +// and for the temporary buffer, for simplicity. +// +// A note on phasing: +// +// The flow fields at two adjacent pyramid levels are offset from each other, +// and we need to account for this in the construction of the interpolation +// kernels. +// +// Consider an 8x8 pixel patch at pyramid level n. This is split into four +// patches at pyramid level n-1. Bringing these patches back up to pyramid level +// n, each sub-patch covers 4x4 pixels, and between them they cover the same +// 8x8 region. +// +// Therefore, at pyramid level n, two adjacent patches look like this: +// +// + - - - - - - - + - - - - - - - + +// | | | +// | x x | x x | +// | | | +// | # | # | +// | | | +// | x x | x x | +// | | | +// + - - - - - - - + - - - - - - - + +// +// where # marks the center of a patch at pyramid level n (the input to this +// function), and x marks the center of a patch at pyramid level n-1 (the output +// of this function). +// +// By counting pixels (marked by +, -, and |), we can see that the flow vectors +// at pyramid level n-1 are offset relative to the flow vectors at pyramid +// level n, by 1/4 of the larger (input) patch size. Therefore, our +// interpolation kernels need to have phases of 0.25 and 0.75. 
+// +// In addition, in order to handle the frame edges correctly, we need to +// generate one output vector to the left and one to the right of each input +// vector, even though these must be interpolated using different source points. +static void upscale_flow_component(double *flow, int cur_width, int cur_height, + int stride, double *tmpbuf) { + const int half_len = FLOW_UPSCALE_TAPS / 2; + + // Check that the outer border is large enough to avoid needing to clamp + // the source locations + assert(half_len <= FLOW_BORDER_OUTER); + + // Horizontal upscale and multiply by 2 + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < cur_width; j++) { + double left = 0; + for (int k = -half_len; k < half_len; k++) { + left += + flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len]; + } + tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left; + + // Right output pixel is 0.25 units to the right of the input pixel + double right = 0; + for (int k = -(half_len - 1); k < (half_len + 1); k++) { + right += flow[i * stride + (j + k)] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right; + } + } + + // Fill in top and bottom borders of tmpbuf + const double *top_row = &tmpbuf[0]; + for (int i = -FLOW_BORDER_OUTER; i < 0; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, top_row, 2 * cur_width * sizeof(*row)); + } + + const double *bottom_row = &tmpbuf[(cur_height - 1) * stride]; + for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, bottom_row, 2 * cur_width * sizeof(*row)); + } + + // Vertical upscale + int upscaled_width = cur_width * 2; + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < upscaled_width; j++) { + double top = 0; + for (int k = -half_len; k < half_len; k++) { + top += + tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len]; + } + flow[(2 * i) * stride + j] = top; + + double bottom = 0; + for (int k = 
-(half_len - 1); k < (half_len + 1); k++) { + bottom += tmpbuf[(i + k) * stride + j] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + flow[(2 * i + 1) * stride + j] = bottom; + } } } // make sure flow_u and flow_v start at 0 static bool compute_flow_field(const ImagePyramid *src_pyr, - const ImagePyramid *ref_pyr, FlowField *flow) { + const ImagePyramid *ref_pyr, int n_levels, + FlowField *flow) { bool mem_status = true; - assert(src_pyr->n_levels == ref_pyr->n_levels); double *flow_u = flow->u; double *flow_v = flow->v; - const size_t flow_size = flow->stride * (size_t)flow->height; - double *u_upscale = aom_malloc(flow_size * sizeof(*u_upscale)); - double *v_upscale = aom_malloc(flow_size * sizeof(*v_upscale)); - if (!u_upscale || !v_upscale) { - mem_status = false; - goto free_uvscale; + double *tmpbuf0; + double *tmpbuf; + + if (n_levels < 2) { + // tmpbuf not needed + tmpbuf0 = NULL; + tmpbuf = NULL; + } else { + // This line must match the calculation of cur_flow_height below + const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT; + + const size_t tmpbuf_size = + (layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride; + tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0)); + if (!tmpbuf0) { + mem_status = false; + goto free_tmpbuf; + } + tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride; } // Compute flow field from coarsest to finest level of the pyramid - for (int level = src_pyr->n_levels - 1; level >= 0; --level) { + // + // Note: We stop after refining pyramid level 1 and interpolating it to + // generate an initial flow field at level 0. We do *not* refine the dense + // flow field at level 0. Instead, we wait until we have generated + // correspondences by interpolating this flow field, and then refine the + // correspondences themselves. This is both faster and gives better output + // compared to refining the flow field at level 0 and then interpolating. 
+ for (int level = n_levels - 1; level >= 1; --level) { const PyramidLayer *cur_layer = &src_pyr->layers[level]; const int cur_width = cur_layer->width; const int cur_height = cur_layer->height; @@ -489,8 +652,10 @@ const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT; const int cur_flow_stride = flow->stride; - for (int i = FLOW_BORDER; i < cur_flow_height - FLOW_BORDER; i += 1) { - for (int j = FLOW_BORDER; j < cur_flow_width - FLOW_BORDER; j += 1) { + for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER; + i += 1) { + for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER; + j += 1) { const int flow_field_idx = i * cur_flow_stride + j; // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels, @@ -523,28 +688,10 @@ const int upscale_flow_height = cur_flow_height << 1; const int upscale_stride = flow->stride; - bool upscale_u_plane = av1_upscale_plane_double_prec( - flow_u, cur_flow_height, cur_flow_width, cur_flow_stride, u_upscale, - upscale_flow_height, upscale_flow_width, upscale_stride); - bool upscale_v_plane = av1_upscale_plane_double_prec( - flow_v, cur_flow_height, cur_flow_width, cur_flow_stride, v_upscale, - upscale_flow_height, upscale_flow_width, upscale_stride); - if (!upscale_u_plane || !upscale_v_plane) { - mem_status = false; - goto free_uvscale; - } - - // Multiply all flow vectors by 2. - // When we move down a pyramid level, the image resolution doubles. 
- // Thus we need to double all vectors in order for them to represent - // the same translation at the next level down - for (int i = 0; i < upscale_flow_height; i++) { - for (int j = 0; j < upscale_flow_width; j++) { - const int index = i * upscale_stride + j; - flow_u[index] = u_upscale[index] * 2.0; - flow_v[index] = v_upscale[index] * 2.0; - } - } + upscale_flow_component(flow_u, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); + upscale_flow_component(flow_v, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); // If we didn't fill in the rightmost column or bottommost row during // upsampling (in order to keep the ratio to exactly 2), fill them @@ -574,9 +721,9 @@ } } } -free_uvscale: - aom_free(u_upscale); - aom_free(v_upscale); + +free_tmpbuf: + aom_free(tmpbuf0); return mem_status; } @@ -587,25 +734,25 @@ // Calculate the size of the bottom (largest) layer of the flow pyramid flow->width = frame_width >> DOWNSAMPLE_SHIFT; flow->height = frame_height >> DOWNSAMPLE_SHIFT; - flow->stride = flow->width; + flow->stride = flow->width + 2 * FLOW_BORDER_OUTER; - const size_t flow_size = flow->stride * (size_t)flow->height; - flow->u = aom_calloc(flow_size, sizeof(*flow->u)); - flow->v = aom_calloc(flow_size, sizeof(*flow->v)); + const size_t flow_size = + flow->stride * (size_t)(flow->height + 2 * FLOW_BORDER_OUTER); - if (flow->u == NULL || flow->v == NULL) { - aom_free(flow->u); - aom_free(flow->v); + flow->buf0 = aom_calloc(2 * flow_size, sizeof(*flow->buf0)); + if (!flow->buf0) { aom_free(flow); return NULL; } + flow->u = flow->buf0 + FLOW_BORDER_OUTER * flow->stride + FLOW_BORDER_OUTER; + flow->v = flow->u + flow_size; + return flow; } static void free_flow_field(FlowField *flow) { - aom_free(flow->u); - aom_free(flow->v); + aom_free(flow->buf0); aom_free(flow); } @@ -615,28 +762,30 @@ // Following the convention in flow_estimation.h, the flow vectors are computed // at fixed points in `src` and point to the corresponding locations in 
`ref`, // regardless of the temporal ordering of the frames. -bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed) { +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { // Precompute information we will need about each frame ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + + const int src_layers = + aom_compute_pyramid(src, bit_depth, DISFLOW_PYRAMID_LEVELS, src_pyramid); + const int ref_layers = + aom_compute_pyramid(ref, bit_depth, DISFLOW_PYRAMID_LEVELS, ref_pyramid); + + if (src_layers < 0 || ref_layers < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { - *mem_alloc_failed = true; - return false; - } + + assert(src_layers == ref_layers); const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; @@ -649,7 +798,7 @@ return false; } - if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) { + if (!compute_flow_field(src_pyramid, ref_pyramid, src_layers, flow)) { *mem_alloc_failed = true; free_flow_field(flow); return false; @@ -664,8 +813,8 @@ return false; } - const int num_correspondences = - determine_disflow_correspondence(src_corners, flow, correspondences); + const int num_correspondences = determine_disflow_correspondence( + src_pyramid, ref_pyramid, src_corners, flow, correspondences); bool 
result = ransac(correspondences, num_correspondences, type, motion_models, num_motion_models, mem_alloc_failed);
diff --git a/aom_dsp/flow_estimation/disflow.h b/aom_dsp/flow_estimation/disflow.h index d772c8a..ac36800 100644 --- a/aom_dsp/flow_estimation/disflow.h +++ b/aom_dsp/flow_estimation/disflow.h
@@ -15,7 +15,6 @@ #include <stdbool.h> #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_dsp/rect.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus @@ -79,6 +78,9 @@ #define DISFLOW_INTERP_BITS 14 typedef struct { + // Start of allocation for u and v buffers + double *buf0; + // x and y directions of flow, per patch double *u; double *v; @@ -89,12 +91,10 @@ int stride; } FlowField; -bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed); +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus }
diff --git a/aom_dsp/flow_estimation/flow_estimation.c b/aom_dsp/flow_estimation/flow_estimation.c index 0f47f86..96624eb 100644 --- a/aom_dsp/flow_estimation/flow_estimation.c +++ b/aom_dsp/flow_estimation/flow_estimation.c
@@ -18,14 +18,6 @@ #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = { - 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH - 16, // GLOBAL_MOTION_METHOD_DISFLOW -}; - // clang-format off const double kIdentityParams[MAX_PARAMDIM] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 @@ -43,17 +35,17 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { switch (gm_method) { case GLOBAL_MOTION_METHOD_FEATURE_MATCH: return av1_compute_global_motion_feature_match( - type, src, ref, bit_depth, motion_models, num_motion_models, - mem_alloc_failed); + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); case GLOBAL_MOTION_METHOD_DISFLOW: - return av1_compute_global_motion_disflow(type, src, ref, bit_depth, - motion_models, num_motion_models, - mem_alloc_failed); + return av1_compute_global_motion_disflow( + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); default: assert(0 && "Unknown global motion estimation type"); } return false;
diff --git a/aom_dsp/flow_estimation/flow_estimation.h b/aom_dsp/flow_estimation/flow_estimation.h index 2dfae24..a38b03f 100644 --- a/aom_dsp/flow_estimation/flow_estimation.h +++ b/aom_dsp/flow_estimation/flow_estimation.h
@@ -61,11 +61,6 @@ double rx, ry; } Correspondence; -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS]; - // Which global motion method should we use in practice? // Disflow is both faster and gives better results than feature matching in // practically all cases, so we use disflow by default @@ -85,7 +80,7 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus
diff --git a/aom_dsp/flow_estimation/ransac.c b/aom_dsp/flow_estimation/ransac.c index b88a07b..7c7bebd 100644 --- a/aom_dsp/flow_estimation/ransac.c +++ b/aom_dsp/flow_estimation/ransac.c
@@ -29,8 +29,13 @@ #define INLIER_THRESHOLD 1.25 #define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) + +// Number of initial models to generate #define NUM_TRIALS 20 +// Number of times to refine the best model found +#define NUM_REFINES 5 + // Flag to enable functions for finding TRANSLATION type models. // // These modes are not considered currently due to a spec bug (see comments @@ -39,63 +44,110 @@ // but disabled, for completeness. #define ALLOW_TRANSLATION_MODELS 0 +typedef struct { + int num_inliers; + double sse; // Sum of squared errors of inliers + int *inlier_indices; +} RANSAC_MOTION; + //////////////////////////////////////////////////////////////////////////////// // ransac -typedef bool (*IsDegenerateFunc)(double *p); -typedef bool (*FindTransformationFunc)(int points, const double *points1, - const double *points2, double *params); -typedef void (*ProjectPointsFunc)(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj); +typedef bool (*FindTransformationFunc)(const Correspondence *points, + const int *indices, int num_indices, + double *params); +typedef void (*ScoreModelFunc)(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model); // vtable-like structure which stores all of the information needed by RANSAC // for a particular model type typedef struct { - IsDegenerateFunc is_degenerate; FindTransformationFunc find_transformation; - ProjectPointsFunc project_points; + ScoreModelFunc score_model; + + // The minimum number of points which can be passed to find_transformation + // to generate a model. + // + // This should be set as small as possible. This is due to an observation + // from section 4 of "Optimal Ransac" by A. Hast, J. Nysjö and + // A. 
Marchetti (https://dspace5.zcu.cz/bitstream/11025/6869/1/Hast.pdf): + // using the minimum possible number of points in the initial model maximizes + // the chance that all of the selected points are inliers. + // + // That paper proposes a method which can deal with models which are + // contaminated by outliers, which helps in cases where the inlier fraction + // is low. However, for our purposes, global motion only gives significant + // gains when the inlier fraction is high. + // + // So we do not use the method from this paper, but we do find that + // minimizing the number of points used for initial model fitting helps + // make the best use of the limited number of models we consider. int minpts; } RansacModelInfo; #if ALLOW_TRANSLATION_MODELS -static void project_points_translation(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = x + mat[0]; - *(proj++) = y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_translation(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = x1 + mat[0]; + const double proj_y = y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #endif // ALLOW_TRANSLATION_MODELS -static void project_points_affine(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = 
*(points++); - *(proj++) = mat[2] * x + mat[3] * y + mat[0]; - *(proj++) = mat[4] * x + mat[5] * y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_affine(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = mat[2] * x1 + mat[3] * y1 + mat[0]; + const double proj_y = mat[4] * x1 + mat[5] * y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #if ALLOW_TRANSLATION_MODELS -static bool find_translation(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_translation(const Correspondence *points, const int *indices, + int num_indices, double *params) { double sumx = 0; double sumy = 0; - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; sumx += dx - sx; sumy += dy - sy; @@ -111,8 +163,8 @@ } #endif // ALLOW_TRANSLATION_MODELS -static bool find_rotzoom(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_rotzoom(const Correspondence *points, const int *indices, + int num_indices, double *params) { const int n = 4; // Size of least-squares problem double mat[4 * 4]; // Accumulator for A'A double y[4]; // Accumulator for A'b @@ -120,11 +172,12 @@ double b; // Single element of b least_squares_init(mat, y, n); 
- for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0] = 1; a[1] = 0; @@ -153,8 +206,8 @@ return true; } -static bool find_affine(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_affine(const Correspondence *points, const int *indices, + int num_indices, double *params) { // Note: The least squares problem for affine models is 6-dimensional, // but it splits into two independent 3-dimensional subproblems. // Solving these two subproblems separately and recombining at the end @@ -174,11 +227,12 @@ least_squares_init(mat[0], y[0], n); least_squares_init(mat[1], y[1], n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0][0] = 1; a[0][1] = sx; @@ -211,12 +265,6 @@ return true; } -typedef struct { - int num_inliers; - double sse; // Sum of squared errors of inliers - int *inlier_indices; -} RANSAC_MOTION; - // Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. 
static int compare_motions(const void *arg_a, const void *arg_b) { const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; @@ -234,15 +282,6 @@ return compare_motions(motion_a, motion_b) < 0; } -static void copy_points_at_indices(double *dest, const double *src, - const int *indices, int num_points) { - for (int i = 0; i < num_points; ++i) { - const int index = indices[i]; - dest[i * 2] = src[index * 2]; - dest[i * 2 + 1] = src[index * 2 + 1]; - } -} - // Returns true on success, false on error static bool ransac_internal(const Correspondence *matched_points, int npoints, MotionModel *motion_models, int num_desired_motions, @@ -257,10 +296,6 @@ int indices[MAX_MINPTS] = { 0 }; - double *points1, *points2; - double *corners1, *corners2; - double *projected_corners; - // Store information for the num_desired_motions best transformations found // and the worst motion among them, as well as the motion currently under // consideration. @@ -271,18 +306,19 @@ // currently under consideration. double params_this_motion[MAX_PARAMDIM]; + // Initialize output models, as a fallback in case we can't find a model + for (i = 0; i < num_desired_motions; i++) { + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return false; } int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts); - points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); - points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); - corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); - corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); - projected_corners = - (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2); motions = (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION)); @@ -295,8 +331,7 @@ int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints * 
(num_desired_motions + 1)); - if (!(points1 && points2 && corners1 && corners2 && projected_corners && - motions && inlier_buffer)) { + if (!(motions && inlier_buffer)) { ret_val = false; *mem_alloc_failed = true; goto finish_ransac; @@ -311,50 +346,22 @@ memset(&current_motion, 0, sizeof(current_motion)); current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints; - for (i = 0; i < npoints; ++i) { - corners1[2 * i + 0] = matched_points[i].x; - corners1[2 * i + 1] = matched_points[i].y; - corners2[2 * i + 0] = matched_points[i].rx; - corners2[2 * i + 1] = matched_points[i].ry; - } - for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) { lcg_pick(npoints, minpts, indices, &seed); - copy_points_at_indices(points1, corners1, indices, minpts); - copy_points_at_indices(points2, corners2, indices, minpts); - - if (model_info->is_degenerate(points1)) { - continue; - } - - if (!model_info->find_transformation(minpts, points1, points2, + if (!model_info->find_transformation(matched_points, indices, minpts, params_this_motion)) { continue; } - model_info->project_points(params_this_motion, corners1, projected_corners, - npoints, 2, 2); - - current_motion.num_inliers = 0; - double sse = 0.0; - for (i = 0; i < npoints; ++i) { - double dx = projected_corners[i * 2] - corners2[i * 2]; - double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1]; - double squared_error = dx * dx + dy * dy; - - if (squared_error < INLIER_THRESHOLD_SQUARED) { - current_motion.inlier_indices[current_motion.num_inliers++] = i; - sse += squared_error; - } - } + model_info->score_model(params_this_motion, matched_points, npoints, + &current_motion); if (current_motion.num_inliers < min_inliers) { // Reject models with too few inliers continue; } - current_motion.sse = sse; if (is_better_motion(&current_motion, worst_kept_motion)) { // This motion is better than the worst currently kept motion. Remember // the inlier points and sse. 
The parameters for each kept motion @@ -386,86 +393,98 @@ // Sort the motions, best first. qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); - // Recompute the motions using only the inliers. + // Refine each of the best N models using iterative estimation. + // + // The idea here is loosely based on the iterative method from + // "Locally Optimized RANSAC" by O. Chum, J. Matas and Josef Kittler: + // https://cmp.felk.cvut.cz/ftp/articles/matas/chum-dagm03.pdf + // + // However, we implement a simpler version than their proposal, and simply + // refit the model repeatedly until the number of inliers stops increasing, + // with a cap on the number of iterations to defend against edge cases which + // only improve very slowly. for (i = 0; i < num_desired_motions; ++i) { - int num_inliers = motions[i].num_inliers; - if (num_inliers > 0) { - assert(num_inliers >= minpts); - - copy_points_at_indices(points1, corners1, motions[i].inlier_indices, - num_inliers); - copy_points_at_indices(points2, corners2, motions[i].inlier_indices, - num_inliers); - - if (!model_info->find_transformation(num_inliers, points1, points2, - motion_models[i].params)) { - // In the unlikely event that this model fitting fails, - // we don't have a good fallback. 
So just clear the output - // model and move on - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; - continue; - } - - // Populate inliers array - for (int j = 0; j < num_inliers; j++) { - int index = motions[i].inlier_indices[j]; - const Correspondence *corr = &matched_points[index]; - motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); - motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); - } - motion_models[i].num_inliers = num_inliers; - } else { - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; + if (motions[i].num_inliers <= 0) { + // Output model has already been initialized to the identity model, + // so just skip setup + continue; } + + bool bad_model = false; + for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) { + int num_inliers = motions[i].num_inliers; + assert(num_inliers >= min_inliers); + + if (!model_info->find_transformation(matched_points, + motions[i].inlier_indices, + num_inliers, params_this_motion)) { + // In the unlikely event that this model fitting fails, we don't have a + // good fallback. So leave this model set to the identity model + bad_model = true; + break; + } + + // Score the newly generated model + model_info->score_model(params_this_motion, matched_points, npoints, + &current_motion); + + // At this point, there are three possibilities: + // 1) If we found more inliers, keep refining. + // 2) If we found the same number of inliers but a lower SSE, we want to + // keep the new model, but further refinement is unlikely to gain much. + // So commit to this new model + // 3) It is possible, but very unlikely, that the new model will have + // fewer inliers. If it does happen, we probably just lost a few + // borderline inliers. So treat the same as case (2). 
+ if (current_motion.num_inliers > motions[i].num_inliers) { + motions[i].num_inliers = current_motion.num_inliers; + motions[i].sse = current_motion.sse; + int *tmp = motions[i].inlier_indices; + motions[i].inlier_indices = current_motion.inlier_indices; + current_motion.inlier_indices = tmp; + } else { + // Refined model is no better, so stop + // This shouldn't be significantly worse than the previous model, + // so it's fine to use the parameters in params_this_motion. + // This saves us from having to cache the previous iteration's params. + break; + } + } + + if (bad_model) continue; + + // Fill in output struct + memcpy(motion_models[i].params, params_this_motion, + MAX_PARAMDIM * sizeof(*motion_models[i].params)); + for (int j = 0; j < motions[i].num_inliers; j++) { + int index = motions[i].inlier_indices[j]; + const Correspondence *corr = &matched_points[index]; + motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); + motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + } + motion_models[i].num_inliers = motions[i].num_inliers; } finish_ransac: aom_free(inlier_buffer); aom_free(motions); - aom_free(projected_corners); - aom_free(corners2); - aom_free(corners1); - aom_free(points2); - aom_free(points1); return ret_val; } -static bool is_collinear3(double *p1, double *p2, double *p3) { - static const double collinear_eps = 1e-3; - const double v = - (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); - return fabs(v) < collinear_eps; -} - -#if ALLOW_TRANSLATION_MODELS -static bool is_degenerate_translation(double *p) { - return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; -} -#endif // ALLOW_TRANSLATION_MODELS - -static bool is_degenerate_affine(double *p) { - return is_collinear3(p, p + 2, p + 4); -} - static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { // IDENTITY - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, // TRANSLATION #if ALLOW_TRANSLATION_MODELS - { is_degenerate_translation, 
find_translation, project_points_translation, - 3 }, + { find_translation, score_translation, 1 }, #else - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, #endif // ROTZOOM - { is_degenerate_affine, find_rotzoom, project_points_affine, 3 }, + { find_rotzoom, score_affine, 2 }, // AFFINE - { is_degenerate_affine, find_affine, project_points_affine, 3 }, + { find_affine, score_affine, 3 }, }; // Returns true on success, false on error
diff --git a/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/aom_dsp/flow_estimation/x86/corner_match_avx2.c index 87c76fa..ff69ae7 100644 --- a/aom_dsp/flow_estimation/x86/corner_match_avx2.c +++ b/aom_dsp/flow_estimation/x86/corner_match_avx2.c
@@ -17,64 +17,112 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the -correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows -of each image, centered at (x1, y1) and (x2, y2) respectively. -*/ -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i, stride1_i = 0, stride2_i = 0; - __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m256i zero = _mm256_setzero_si256(); - __m128i v1, v2; +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev - sum_vec = zero; - sumsq2_vec = zero; - cross_vec = zero; + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. 
+*/ +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, + int y, double *mean, + double *one_over_stddev) { + __m256i sum_vec = _mm256_setzero_si256(); + __m256i sumsq_vec = _mm256_setzero_si256(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); + + sum_vec = _mm256_add_epi16(sum_vec, v); + sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 + // additions, sum vertically to do 4 more, then the last 2 in scalar code. + const __m256i ones = _mm256_load_si256((__m256i *)ones_array); + const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); + const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); + const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), + _mm256_extracti128_si256(tmp_8x32, 1)); + const int sum = + _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); + const int sumsq = + _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
+*/ +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); - v1_1 = _mm256_cvtepu8_epi16(v1); - v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); - v2_1 = _mm256_cvtepu8_epi16(v2); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); + const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); - v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); - sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); - sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); - cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); - stride1_i += stride1; - stride2_i += stride2; + frame1 += stride1; + frame2 += stride2; } - __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); - sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); - int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); - int sum2_acc = _mm256_extract_epi32(sum_vec, 4); - __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); - __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); - temp1 = _mm256_add_epi32(unp_low, unp_hig); + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), + _mm256_extracti128_si256(cross_vec, 1)); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + 
_mm_extract_epi32(tmp, 3); - __m128i low_sumsq = _mm256_castsi256_si128(temp1); - low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); - low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); - int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); - int cross_acc = _mm_extract_epi32(low_sumsq, 2); - - int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; - int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; - return cov / sqrt((double)var2); + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; }
diff --git a/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/aom_dsp/flow_estimation/x86/corner_match_sse4.c index b3cb5bc..bff7db6 100644 --- a/aom_dsp/flow_estimation/x86/corner_match_sse4.c +++ b/aom_dsp/flow_estimation/x86/corner_match_sse4.c
@@ -21,84 +21,125 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(16, static const uint16_t, ones_array[8]) = { 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_sse4.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. -*/ -double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i; - // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, - // 2) - __m128i sum1_vec = _mm_setzero_si128(); - __m128i sum2_vec = _mm_setzero_si128(); - // 4 32-bit partial sums of squares - __m128i sumsq2_vec = _mm_setzero_si128(); - __m128i cross_vec = _mm_setzero_si128(); +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m128i zero = _mm_setzero_si128(); + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. 
+*/ +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev) { + // 8 16-bit partial sums of pixels + // Each lane sums at most 2*MATCH_SZ pixels, which can have values up to 255, + // and is therefore at most 2*MATCH_SZ*255, which is > 2^8 but < 2^16. + // Thus this value is safe to store in 16 bits. + __m128i sum_vec = _mm_setzero_si128(); + + // 8 32-bit partial sums of squares + __m128i sumsq_vec_l = _mm_setzero_si128(); + __m128i sumsq_vec_r = _mm_setzero_si128(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v = _mm_loadu_si128((__m128i *)frame); + const __m128i v_l = _mm_cvtepu8_epi16(v); + const __m128i v_r = _mm_cvtepu8_epi16(_mm_srli_si128(v, 8)); + + sum_vec = _mm_add_epi16(sum_vec, _mm_add_epi16(v_l, v_r)); + sumsq_vec_l = _mm_add_epi32(sumsq_vec_l, _mm_madd_epi16(v_l, v_l)); + sumsq_vec_r = _mm_add_epi32(sumsq_vec_r, _mm_madd_epi16(v_r, v_r)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 4x32-bit values, hadd() to perform four + // additions, then perform the last two additions in scalar code. + const __m128i ones = _mm_load_si128((__m128i *)ones_array); + const __m128i partial_sum = _mm_madd_epi16(sum_vec, ones); + const __m128i partial_sumsq = _mm_add_epi32(sumsq_vec_l, sumsq_vec_r); + const __m128i tmp = _mm_hadd_epi32(partial_sum, partial_sumsq); + const int sum = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1); + const int sumsq = _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. 
+ To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. +*/ +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + // 8 32-bit partial sums of products + __m128i cross_vec_l = _mm_setzero_si128(); + __m128i cross_vec_r = _mm_setzero_si128(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - const __m128i v1 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask); - const __m128i v2 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask); - - // Using the 'sad' intrinsic here is a bit faster than adding - // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit - // conversion step later, for a net speedup of ~10% - sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); - sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = _mm_loadu_si128((__m128i *)frame1); + const __m128i v2 = _mm_loadu_si128((__m128i *)frame2); const __m128i v1_l = _mm_cvtepu8_epi16(v1); const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); const __m128i v2_l = _mm_cvtepu8_epi16(v2); const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); - sumsq2_vec = _mm_add_epi32( - sumsq2_vec, - _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); - cross_vec = _mm_add_epi32( - cross_vec, - _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + cross_vec_l = _mm_add_epi32(cross_vec_l, _mm_madd_epi16(v1_l, v2_l)); + cross_vec_r = _mm_add_epi32(cross_vec_r, _mm_madd_epi16(v1_r, v2_r)); + + frame1 += stride1; + frame2 += 
stride2; } - // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, - // cross_vec) - // as holding 4 32-bit elements each, which we want to sum horizontally. - // We do this by transposing and then summing vertically. - __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); - __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); - __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); - __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(cross_vec_l, cross_vec_r); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); - __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); - __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); - __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); - __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); - - __m128i res = - _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); - - int sum1 = _mm_extract_epi32(res, 0); - int sum2 = _mm_extract_epi32(res, 1); - int sumsq2 = _mm_extract_epi32(res, 2); - int cross = _mm_extract_epi32(res, 3); - - int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - int cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; }
diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c new file mode 100644 index 0000000..ad5a1bd --- /dev/null +++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c
@@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "config/aom_dsp_rtcd.h" + +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_avx2.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. 
+ __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +// +// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation +// instead of bicubic interpolation +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + const __m256i zero = _mm256_setzero_si256(); + + // Accumulate 8 32-bit partial sums for each element of b + // These will be flattened at the end.
+ __m256i b0_acc = _mm256_setzero_si256(); + __m256i b1_acc = _mm256_setzero_si256(); + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); + + // Storage for intermediate values between the two convolution directions + // In the AVX2 implementation, this needs a dummy row at the end, because + // we generate 2 rows at a time but the total number of rows is odd. + // So we generate one more row than we actually need. + DECLARE_ALIGNED(32, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 4)]); + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). 
+ const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + + // Prepare the kernel vectors + // We split the kernel into two vectors with kernel indices: + // 0, 1, 0, 1, 0, 1, 0, 1, and + // 2, 3, 2, 3, 2, 3, 2, 3 + __m256i h_kernel_01 = _mm256_broadcastd_epi32(kernels); + __m256i h_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 8)); + + __m256i round_const_h = _mm256_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); + + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; i += 2) { + const int y_w = y0 + i; + const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load this row of pixels. + // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, + // for a total of 11 pixels. Here we load 16 pixels, but only use + // the first 11. + __m256i row = + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); + + // Expand pixels to int16s + // We must use unpacks here, as we have one row in each 128-bit lane + // and want to handle each of those independently. + // This is in contrast to _mm256_cvtepu8_epi16(), which takes a single + // 128-bit input and widens it to 256 bits. 
+ __m256i px_0to7_i16 = _mm256_unpacklo_epi8(row, zero); + __m256i px_4to10_i16 = + _mm256_unpacklo_epi8(_mm256_srli_si256(row, 4), zero); + + // Compute first four outputs + // input pixels 0, 1, 1, 2, 2, 3, 3, 4 + // * kernel 0, 1, 0, 1, 0, 1, 0, 1 + __m256i px0 = + _mm256_unpacklo_epi16(px_0to7_i16, _mm256_srli_si256(px_0to7_i16, 2)); + // input pixels 2, 3, 3, 4, 4, 5, 5, 6 + // * kernel 2, 3, 2, 3, 2, 3, 2, 3 + __m256i px1 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_0to7_i16, 4), + _mm256_srli_si256(px_0to7_i16, 6)); + // Convolve with kernel and sum 2x2 boxes to form first 4 outputs + __m256i sum0 = _mm256_add_epi32(_mm256_madd_epi16(px0, h_kernel_01), + _mm256_madd_epi16(px1, h_kernel_23)); + + __m256i out0 = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_h), + DISFLOW_INTERP_BITS - 6); + + // Compute second four outputs + __m256i px2 = + _mm256_unpacklo_epi16(px_4to10_i16, _mm256_srli_si256(px_4to10_i16, 2)); + __m256i px3 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_4to10_i16, 4), + _mm256_srli_si256(px_4to10_i16, 6)); + __m256i sum1 = _mm256_add_epi32(_mm256_madd_epi16(px2, h_kernel_01), + _mm256_madd_epi16(px3, h_kernel_23)); + + // Round by just enough bits that the result is + // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 + // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 + // as it does now + // This means shifting down so we have 6 extra bits, for a maximum value + // of +18360, which can occur if u_frac == 0.5 and the input pixels are + // {0, 255, 255, 0}. 
+ __m256i out1 = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_h), + DISFLOW_INTERP_BITS - 6); + + _mm256_storeu_si256((__m256i *)tmp_row, _mm256_packs_epi32(out0, out1)); + } + + // Vertical convolution + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + __m256i round_const_v = _mm256_set1_epi32(1 << (round_bits - 1)); + + __m256i v_kernel_01 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 4)); + __m256i v_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 12)); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load 5 rows of 8 x 16-bit values, and pack into 4 registers + // holding rows {0, 1}, {1, 2}, {2, 3}, {3, 4} + __m128i row0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i row1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i row2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + __m128i row3 = + _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); + __m128i row4 = + _mm_loadu_si128((__m128i *)(tmp_row + 3 * DISFLOW_PATCH_SIZE)); + + __m256i px0 = _mm256_set_m128i(row1, row0); + __m256i px1 = _mm256_set_m128i(row2, row1); + __m256i px2 = _mm256_set_m128i(row3, row2); + __m256i px3 = _mm256_set_m128i(row4, row3); + + // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , + // but each multiply expands its output to 32 bits. 
So we need to be + // a little clever about how we do this + __m256i sum0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpacklo_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpacklo_epi16(px2, px3), v_kernel_23)); + __m256i sum1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpackhi_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpackhi_epi16(px2, px3), v_kernel_23)); + + __m256i sum0_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_v), round_bits); + __m256i sum1_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_v), round_bits); + + __m256i warped = _mm256_packs_epi32(sum0_rounded, sum1_rounded); + __m128i src_pixels_u8 = xx_loadu_2x64(&src[(y + i + 1) * stride + x], + &src[(y + i) * stride + x]); + __m256i src_pixels = + _mm256_slli_epi16(_mm256_cvtepu8_epi16(src_pixels_u8), 3); + + // Calculate delta from the target patch + __m256i dt = _mm256_sub_epi16(warped, src_pixels); + + // Load 2x8 elements each of dx and dy, to pair with the 2x8 elements of dt + // that we have just computed. Then compute 2x8 partial sums of dx * dt + // and dy * dt, implicitly sum to give 2x4 partial sums of each, and + // accumulate. + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * DISFLOW_PATCH_SIZE]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * DISFLOW_PATCH_SIZE]); + b0_acc = _mm256_add_epi32(b0_acc, _mm256_madd_epi16(dx_row, dt)); + b1_acc = _mm256_add_epi32(b1_acc, _mm256_madd_epi16(dy_row, dt)); + } + + // Flatten the two sets of partial sums to find the final value of b + // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). + // We need to do 14 additions in total; a `hadd` instruction can take care + // of eight of them, then a vertical sum can do four more, leaving two + // scalar additions.
+ __m256i partial_sum_256 = _mm256_hadd_epi32(b0_acc, b1_acc); + __m128i partial_sum = + _mm_add_epi32(_mm256_extracti128_si256(partial_sum_256, 0), + _mm256_extracti128_si256(partial_sum_256, 1)); + b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); + b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); +} + +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + const __m256i zero = _mm256_setzero_si256(); + + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); + __m256i row_m1_0_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); + __m256i row_m1_0_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 2), zero); + + __m256i row_m1_0_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_a, row_m1_0_c), + _mm256_slli_epi16(row_m1_0_b, 1)); + __m256i row_m1_0_hdiff = _mm256_sub_epi16(row_m1_0_a, row_m1_0_c); + + // Main loop: For each pair of output rows (i, i+1): + // * Load rows (i+1, i+2) and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + // Load rows (i+1, i+2) and apply both horizontal filters + const __m256i row_p1_p2 = + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); + const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); + const __m256i row_p1_p2_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); + const __m256i row_p1_p2_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 2), zero); + + const __m256i 
row_p1_p2_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_p1_p2_a, row_p1_p2_c), + _mm256_slli_epi16(row_p1_p2_b, 1)); + const __m256i row_p1_p2_hdiff = _mm256_sub_epi16(row_p1_p2_a, row_p1_p2_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m256i row_0_p1_hdiff = + _mm256_permute2x128_si256(row_m1_0_hdiff, row_p1_p2_hdiff, 0x21); + const __m256i dx_row = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_hdiff, row_p1_p2_hdiff), + _mm256_slli_epi16(row_0_p1_hdiff, 1)); + const __m256i dy_row = + _mm256_sub_epi16(row_m1_0_hsmooth, row_p1_p2_hsmooth); + + _mm256_storeu_si256((__m256i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm256_storeu_si256((__m256i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_0_hsmooth = row_p1_p2_hsmooth; + row_m1_0_hdiff = row_p1_p2_hdiff; + } +} + +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + __m256i acc[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * dx_stride]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * dy_stride]); + + acc[0] = _mm256_add_epi32(acc[0], _mm256_madd_epi16(dx_row, dx_row)); + acc[1] = _mm256_add_epi32(acc[1], _mm256_madd_epi16(dx_row, dy_row)); + // Don't compute acc[2], as it should be equal to acc[1] + acc[3] = _mm256_add_epi32(acc[3], _mm256_madd_epi16(dy_row, dy_row)); + } + + // Condense sums + __m256i partial_sum_0 = _mm256_hadd_epi32(acc[0], acc[1]); + __m256i partial_sum_1 = _mm256_hadd_epi32(acc[1], acc[3]); + __m256i result_256 = _mm256_hadd_epi32(partial_sum_0, partial_sum_1); + __m128i result = _mm_add_epi32(_mm256_extracti128_si256(result_256, 0), + 
_mm256_extracti128_si256(result_256, 1)); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); + + // Convert results to doubles and store + _mm256_storeu_pd(M, _mm256_cvtepi32_pd(result)); +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. 
+static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + DECLARE_ALIGNED(32, double, M[4]); + DECLARE_ALIGNED(32, double, M_inv[4]); + DECLARE_ALIGNED(32, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(32, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + int b[2]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter(src_patch, stride, dx, dy); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +}
diff --git a/aom_dsp/flow_estimation/x86/disflow_sse4.c b/aom_dsp/flow_estimation/x86/disflow_sse4.c index 3c2159a..e0a4bd0 100644 --- a/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -1,13 +1,12 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. All rights reserved * - * This source code is subject to the terms of the BSD 3-Clause Clear License - * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear - * License was not distributed with this source code in the LICENSE file, you - * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the - * Alliance for Open Media Patent License 1.0 was not distributed with this - * source code in the PATENTS file, you can obtain it at - * aomedia.org/license/patent-license/. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> @@ -20,47 +19,60 @@ #include "config/aom_dsp_rtcd.h" -// Internal cross-check against C code -// If you set this to 1 and compile in debug mode, then the outputs of the two -// convolution stages will be checked against the plain C version of the code, -// and an assertion will be fired if the results differ. -#define CHECK_RESULTS 0 +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_sse4.c if DISFLOW_PATCH_SIZE != 8" +#endif -// Note: Max sum(+ve coefficients) = 1.125 * scale -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { - // Check that the fractional position is in range. - // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. 
This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); +// Compute horizontal and vertical kernels and return them packed into a +// register. The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. 
+ __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); } -static INLINE void get_cubic_kernel_int(double x, int16_t *kernel) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - -#if CHECK_RESULTS -static INLINE int get_cubic_value_int(const int *p, const int16_t *kernel) { - return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + - kernel[3] * p[3]; -} -#endif // CHECK_RESULTS - // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. 
// This function returns the sum of squared pixel differences between @@ -80,10 +92,6 @@ // These will be flattened at the end. __m128i b0_acc = _mm_setzero_si128(); __m128i b1_acc = _mm_setzero_si128(); -#if CHECK_RESULTS - // Also keep a running sum using the C algorithm, for cross-checking - int c_result[2] = { 0 }; -#endif // CHECK_RESULTS // Split offset into integer and fractional parts, and compute cubic // interpolation kernels @@ -92,13 +100,11 @@ const double u_frac = u - floor(u); const double v_frac = v - floor(v); - int16_t h_kernel[4]; - int16_t v_kernel[4]; - get_cubic_kernel_int(u_frac, h_kernel); - get_cubic_kernel_int(v_frac, v_kernel); + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + DECLARE_ALIGNED(16, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the @@ -121,8 +127,8 @@ // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 - __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]); - __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]); + __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0)); + __m128i h_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2)); __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); @@ -141,10 +147,6 @@ __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); - // Relevant multiply instruction - // This multiplies pointwise, then sums in pairs. 
- //_mm_madd_epi16(); - // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 @@ -180,43 +182,14 @@ DISFLOW_INTERP_BITS - 6); _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); - -#if CHECK_RESULTS && !defined(NDEBUG) - // Cross-check - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - const int x_w = x0 + j; - int arr[4]; - - arr[0] = (int)ref[y_w * stride + (x_w - 1)]; - arr[1] = (int)ref[y_w * stride + (x_w + 0)]; - arr[2] = (int)ref[y_w * stride + (x_w + 1)]; - arr[3] = (int)ref[y_w * stride + (x_w + 2)]; - - // Apply kernel and round, keeping 6 extra bits of precision. - // - // 6 is the maximum allowable number of extra bits which will avoid - // the intermediate values overflowing an int16_t. The most extreme - // intermediate value occurs when: - // * The input pixels are [0, 255, 255, 0] - // * u_frac = 0.5 - // In this case, the un-scaled output is 255 * 1.125 = 286.875. - // As an integer with 6 fractional bits, that is 18360, which fits - // in an int16_t. But with 7 fractional bits it would be 36720, - // which is too large. 
- const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel), - DISFLOW_INTERP_BITS - 6); - (void)c_value; // Suppress warnings - assert(tmp_row[j] == c_value); - } -#endif // CHECK_RESULTS } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); - __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]); - __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]); + __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1)); + __m128i v_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3)); for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; @@ -259,30 +232,6 @@ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); - -#if CHECK_RESULTS - int16_t dt_arr[8]; - memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr)); - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; - int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], - p[2 * DISFLOW_PATCH_SIZE] }; - const int result = get_cubic_value_int(arr, v_kernel); - - // Apply kernel and round. - // This time, we have to round off the 6 extra bits which were kept - // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits - // of precision to match the scale of the dx and dy arrays. 
- const int c_warped = ROUND_POWER_OF_TWO(result, round_bits); - const int c_src_px = src[(x + j) + (y + i) * stride] << 3; - const int c_dt = c_warped - c_src_px; - - assert(dt_arr[j] == c_dt); - - c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt; - c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt; - } -#endif // CHECK_RESULTS } // Flatten the two sets of partial sums to find the final value of b @@ -292,156 +241,66 @@ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); - -#if CHECK_RESULTS - assert(b[0] == c_result[0]); - assert(b[1] == c_result[1]); -#endif // CHECK_RESULTS } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. 
+static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m128i row_m1 = _mm_loadu_si128((__m128i *)(src - src_stride - 1)); + __m128i row_m1_a = _mm_cvtepu8_epi16(row_m1); + __m128i row_m1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 1)); + __m128i row_m1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 2)); - // Horizontal filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; + __m128i row_m1_hsmooth = _mm_add_epi16(_mm_add_epi16(row_m1_a, row_m1_c), + _mm_slli_epi16(row_m1_b, 1)); + __m128i row_m1_hdiff = _mm_sub_epi16(row_m1_a, row_m1_c); - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + __m128i row = _mm_loadu_si128((__m128i *)(src - 1)); + __m128i row_a = _mm_cvtepu8_epi16(row); + __m128i row_b = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); + __m128i row_c = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - __m128i out = _mm_sub_epi16(px0, px2); + __m128i row_hsmooth = + _mm_add_epi16(_mm_add_epi16(row_a, row_c), _mm_slli_epi16(row_b, 1)); + __m128i row_hdiff = _mm_sub_epi16(row_a, row_c); - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); + // Main loop: For each of the 8 output rows: + // * Load row i+1 and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Load row i+1 and apply both horizontal filters + const __m128i row_p1 = + _mm_loadu_si128((__m128i 
*)(src + (i + 1) * src_stride - 1)); + const __m128i row_p1_a = _mm_cvtepu8_epi16(row_p1); + const __m128i row_p1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 1)); + const __m128i row_p1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 2)); -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } + const __m128i row_p1_hsmooth = _mm_add_epi16( + _mm_add_epi16(row_p1_a, row_p1_c), _mm_slli_epi16(row_p1_b, 1)); + const __m128i row_p1_hdiff = _mm_sub_epi16(row_p1_a, row_p1_c); - // Vertical filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m128i dx_row = + _mm_add_epi16(_mm_add_epi16(row_m1_hdiff, row_p1_hdiff), + _mm_slli_epi16(row_hdiff, 1)); + const __m128i dy_row = _mm_sub_epi16(row_m1_hsmooth, row_p1_hsmooth); - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + _mm_storeu_si128((__m128i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm_storeu_si128((__m128i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - _mm_storeu_si128((__m128i 
*)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" 
convolution operation - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = _mm_sub_epi16(px0, px2); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_hsmooth = row_hsmooth; + row_m1_hdiff = row_hdiff; + row_hsmooth = row_p1_hsmooth; + row_hdiff = row_p1_hdiff; } } @@ -476,30 +335,6 @@ // which is convenient for integerized SIMD implementation. 
result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); -#if CHECK_RESULTS - int tmp[4] = { 0 }; - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { - tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; - tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; - // Don't compute tmp[2], as it should be equal to tmp[1] - tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; - } - } - - // Apply regularization - tmp[0] += 1; - tmp[3] += 1; - - tmp[2] = tmp[1]; - - assert(tmp[0] == _mm_extract_epi32(result, 0)); - assert(tmp[1] == _mm_extract_epi32(result, 1)); - assert(tmp[2] == _mm_extract_epi32(result, 2)); - assert(tmp[3] == _mm_extract_epi32(result, 3)); -#endif // CHECK_RESULTS - // Convert results to doubles and store _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); @@ -525,16 +360,15 @@ void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { - double M[4]; - double M_inv[4]; + DECLARE_ALIGNED(16, double, M[4]); + DECLARE_ALIGNED(16, double, M_inv[4]); + DECLARE_ALIGNED(16, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(16, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; - int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; - int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; - sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); - sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv);
diff --git a/aom_dsp/mathutils.h b/aom_dsp/mathutils.h index cbb6cf4..26635fc 100644 --- a/aom_dsp/mathutils.h +++ b/aom_dsp/mathutils.h
@@ -17,7 +17,6 @@ #include <string.h> #include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" static const double TINY_NEAR_ZERO = 1.0E-16;
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c index 065ec9a..947dfd3 100644 --- a/aom_dsp/noise_model.c +++ b/aom_dsp/noise_model.c
@@ -19,6 +19,8 @@ #include "aom_dsp/noise_model.h" #include "aom_dsp/noise_util.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" #define kLowPolyNumParams 3 @@ -1555,7 +1557,7 @@ } static int denoise_and_model_realloc_if_necessary( - struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd) { if (ctx->width == sd->y_width && ctx->height == sd->y_height && ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) return 1; @@ -1624,7 +1626,7 @@ // TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer // are null pointers) correctly. int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *sd, + const YV12_BUFFER_CONFIG *sd, aom_film_grain_t *film_grain, int apply_denoise) { const int block_size = ctx->block_size; const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h index 8228aea..5b2d7ef 100644 --- a/aom_dsp/noise_model.h +++ b/aom_dsp/noise_model.h
@@ -297,14 +297,14 @@ * aom_denoise_and_model_alloc that holds some * buffers for denoising and the current noise * estimate. - * \param[in,out] buf The raw input buffer to be denoised. + * \param[in,out] sd The raw input buffer to be denoised. * \param[out] grain Output film grain parameters * \param[in] apply_denoise Whether or not to apply the denoising to the * frame that will be encoded */ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain, - int apply_denoise); + const YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *grain, int apply_denoise); /*!\brief Allocates a context that can be used for denoising and noise modeling. *
diff --git a/aom_dsp/odintrin.h b/aom_dsp/odintrin.h index 20a7f58..9e4ba50 100644 --- a/aom_dsp/odintrin.h +++ b/aom_dsp/odintrin.h
@@ -70,20 +70,6 @@ #define OD_ARG_NONNULL(x) #endif -/** Copy n elements of memory from src to dst. The 0* term provides - compile-time type checking */ -#if !defined(OVERRIDE_OD_COPY) -#define OD_COPY(dst, src, n) \ - (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src)))) -#endif - -/** Copy n elements of memory from src to dst, allowing overlapping regions. - The 0* term provides compile-time type checking */ -#if !defined(OVERRIDE_OD_MOVE) -# define OD_MOVE(dst, src, n) \ - (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) )) -#endif - /*All of these macros should expect floats as arguments.*/ # define OD_SIGNMASK(a) (-((a) < 0)) # define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c index f71590c..cf0de29 100644 --- a/aom_dsp/psnr.c +++ b/aom_dsp/psnr.c
@@ -349,7 +349,11 @@ int i; uint64_t total_sse = 0; uint32_t total_samples = 0; +#if CONFIG_LIBVMAF_PSNR_PEAK + double peak = (double)(255 << (in_bit_depth - 8)); +#else double peak = (double)((1 << in_bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK const unsigned int input_shift = bit_depth - in_bit_depth; for (i = 0; i < 3; ++i) { @@ -384,7 +388,11 @@ // Compute PSNR based on stream bit depth if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { +#if CONFIG_LIBVMAF_PSNR_PEAK + peak = (double)(255 << (bit_depth - 8)); +#else peak = (double)((1 << bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK total_sse = 0; total_samples = 0; for (i = 0; i < 3; ++i) {
diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h index 96a17f4..afe6e08 100644 --- a/aom_dsp/psnr.h +++ b/aom_dsp/psnr.h
@@ -31,7 +31,7 @@ /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). * * \param[in] samples Number of samples * \param[in] peak Max sample value
diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c index 324a18b..05ddbb2 100644 --- a/aom_dsp/pyramid.c +++ b/aom_dsp/pyramid.c
@@ -12,7 +12,7 @@ #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" // TODO(rachelbarker): Move needed code from av1/ to aom_dsp/ #include "av1/common/resize.h" @@ -26,18 +26,16 @@ // levels. This is counted in the size checked against the max allocation // limit // * Then calls aom_alloc_pyramid() to actually create the pyramid -// * Pyramid is initially marked as invalid (no data) -// * Whenever pyramid is needed, we check the valid flag. If set, use existing -// data. If not set, compute full pyramid -// * Whenever frame buffer is reused, clear the valid flag +// * Pyramid is initially marked as containing no valid data +// * Each pyramid layer is computed on-demand, the first time it is requested +// * Whenever frame buffer is reused, reset the counter of filled levels. +// This invalidates all of the existing pyramid levels. // * Whenever frame buffer is resized, reallocate pyramid -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); size_t alloc_size = 0; alloc_size += sizeof(ImagePyramid); @@ -100,12 +98,10 @@ return alloc_size; } -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = 
AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr)); if (!pyr) { @@ -118,8 +114,8 @@ return NULL; } - pyr->valid = false; - pyr->n_levels = n_levels; + pyr->max_levels = n_levels; + pyr->filled_levels = 0; // Compute sizes and offsets for each pyramid level // These are gathered up first, so that we can allocate all pyramid levels @@ -248,46 +244,68 @@ } } -// Compute coarse to fine pyramids for a frame +// Compute downsampling pyramid for a frame +// +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +// // This must only be called while holding frame_pyr->mutex -static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *frame_pyr) { - int n_levels = frame_pyr->n_levels; +static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *frame_pyr) { + int already_filled_levels = frame_pyr->filled_levels; + + // This condition should already be enforced by aom_compute_pyramid + assert(n_levels <= frame_pyr->max_levels); + + if (already_filled_levels >= n_levels) { + return n_levels; + } + const int frame_width = frame->y_crop_width; const int frame_height = frame->y_crop_height; const int frame_stride = frame->y_stride; assert((frame_width >> n_levels) >= 0); assert((frame_height >> n_levels) >= 0); - PyramidLayer *first_layer = &frame_pyr->layers[0]; - if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { - // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits - assert(first_layer->width == frame_width); - assert(first_layer->height == frame_height); + if (already_filled_levels == 
0) { + // Fill in largest level from the original image + PyramidLayer *first_layer = &frame_pyr->layers[0]; + if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { + // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits + assert(first_layer->width == frame_width); + assert(first_layer->height == frame_height); - uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); - uint8_t *pyr_buffer = first_layer->buffer; - int pyr_stride = first_layer->stride; - for (int y = 0; y < frame_height; y++) { - uint16_t *frame_row = frame_buffer + y * frame_stride; - uint8_t *pyr_row = pyr_buffer + y * pyr_stride; - for (int x = 0; x < frame_width; x++) { - pyr_row[x] = frame_row[x] >> (bit_depth - 8); + uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); + uint8_t *pyr_buffer = first_layer->buffer; + int pyr_stride = first_layer->stride; + for (int y = 0; y < frame_height; y++) { + uint16_t *frame_row = frame_buffer + y * frame_stride; + uint8_t *pyr_row = pyr_buffer + y * pyr_stride; + for (int x = 0; x < frame_width; x++) { + pyr_row[x] = frame_row[x] >> (bit_depth - 8); + } } + + fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); + } else { + // For frames stored in an 8-bit buffer, we don't need to copy anything - + // we can just reference the original image buffer + first_layer->buffer = frame->y_buffer; + first_layer->width = frame_width; + first_layer->height = frame_height; + first_layer->stride = frame_stride; } - fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); - } else { - // For frames stored in an 8-bit buffer, we need to configure the first - // pyramid layer to point at the original image buffer - first_layer->buffer = frame->y_buffer; - first_layer->width = frame_width; - first_layer->height = frame_height; - first_layer->stride = frame_stride; + already_filled_levels = 1; } // Fill in the remaining levels through progressive downsampling - for (int level = 1; level < n_levels; ++level) { + for (int level = 
already_filled_levels; level < n_levels; ++level) { + bool mem_status = false; PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -298,6 +316,11 @@ int this_height = this_layer->height; int this_stride = this_layer->stride; + // The width and height of the previous layer that needs to be considered to + // derive the current layer frame. + const int input_layer_width = this_width << 1; + const int input_layer_height = this_height << 1; + // Compute the this pyramid level by downsampling the current level. // // We downsample by a factor of exactly 2, clipping the rightmost and @@ -312,13 +335,35 @@ // 2) Up/downsampling by a factor of 2 can be implemented much more // efficiently than up/downsampling by a generic ratio. // TODO(rachelbarker): Use optimized downsample-by-2 function - if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, - prev_stride, this_buffer, this_height, this_width, - this_stride)) - return false; + + // SIMD support has been added specifically for cases where the downsample + // factor is exactly 2. In such instances, horizontal and vertical resizing + // is performed utilizing the down2_symeven() function, which considers the + // even dimensions of the input layer. + if (should_resize_by_half(input_layer_height, input_layer_width, + this_height, this_width)) { + assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 && + "Input width or height cannot be odd."); + mem_status = av1_resize_plane_to_half( + prev_buffer, input_layer_height, input_layer_width, prev_stride, + this_buffer, this_height, this_width, this_stride); + } else { + mem_status = av1_resize_plane(prev_buffer, input_layer_height, + input_layer_width, prev_stride, this_buffer, + this_height, this_width, this_stride); + } + + // Terminate early in cases of memory allocation failure. 
+ if (!mem_status) { + frame_pyr->filled_levels = n_levels; + return -1; + } + fill_border(this_buffer, this_width, this_height, this_stride); } - return true; + + frame_pyr->filled_levels = n_levels; + return n_levels; } // Fill out a downsampling pyramid for a given frame. @@ -327,63 +372,72 @@ // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// No matter how small the frame is, at least one level is guaranteed +// to be filled. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr) { +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. 
+int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, and hold it while + // computing any additional pyramid levels, to ensure proper behaviour + // when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - if (!pyr->valid) { - pyr->valid = fill_pyramid(frame, bit_depth, pyr); + n_levels = AOMMIN(n_levels, pyr->max_levels); + int result = n_levels; + if (pyr->filled_levels < n_levels) { + // Compute any missing levels that we need + result = fill_pyramid(frame, bit_depth, n_levels, pyr); } - bool valid = pyr->valid; - // At this point, the pyramid is guaranteed to be valid, and can be safely - // read from without holding the mutex any more - + // At this point, as long as result >= 0, the requested number of pyramid + // levels are guaranteed to be valid, and can be safely read from without + // holding the mutex any further + assert(IMPLIES(result >= 0, pyr->filled_levels >= n_levels)); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #ifndef NDEBUG -// Check if a pyramid has already been computed. 
+// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr) { +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, to ensure proper + // behaviour when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - bool valid = pyr->valid; + bool result = (pyr->filled_levels >= n_levels); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #endif @@ -394,7 +448,7 @@ #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - pyr->valid = false; + pyr->filled_levels = 0; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD
diff --git a/aom_dsp/pyramid.h b/aom_dsp/pyramid.h index 9442a1f..745bb7e 100644 --- a/aom_dsp/pyramid.h +++ b/aom_dsp/pyramid.h
@@ -19,7 +19,7 @@ #include "config/aom_config.h" #include "aom_scale/yv12config.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,23 +57,31 @@ // same time // // Semantics: - // * This mutex must be held whenever reading or writing the `valid` flag + // * This mutex must be held whenever reading or writing the + // `filled_levels` field // // * This mutex must also be held while computing the image pyramid, // to ensure that only one thread may do so at a time. // - // * However, once you have read the valid flag and seen a true value, - // it is safe to drop the mutex and read from the remaining fields. - // This is because, once the image pyramid is computed, its contents + // * However, once you have read the filled_levels field and observed + // a value N, it is safe to drop the mutex and read from the remaining + // fields, including the first N pyramid levels (but no higher). + // Note that filled_levels must be read once and cached in a local variable + // in order for this to be safe - it cannot be re-read without retaking + // the mutex. + // + // This works because, once the image pyramid is computed, its contents // will not be changed until the parent frame buffer is recycled, // which will not happen until there are no more outstanding references // to the frame buffer. pthread_mutex_t mutex; #endif - // Flag indicating whether the pyramid contains valid data - bool valid; - // Number of allocated/filled levels in this pyramid - int n_levels; + // Maximum number of levels for the given frame size + // We always allocate enough memory for this many levels, as the memory + // cost of higher levels of the pyramid is minimal. 
+ int max_levels; + // Number of levels which currently hold valid data + int filled_levels; // Pointer to allocated buffer uint8_t *buffer_alloc; // Data for each level @@ -82,11 +90,9 @@ PyramidLayer *layers; } ImagePyramid; -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit); +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit); -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit); +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit); // Fill out a downsampling pyramid for a given frame. // @@ -94,23 +100,28 @@ // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr); +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr); #ifndef NDEBUG -// Check if a pyramid has already been computed. 
+// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr); +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels); #endif // Mark a pyramid as no longer containing valid data.
diff --git a/aom_dsp/rect.h b/aom_dsp/rect.h deleted file mode 100644 index 11bdaca..0000000 --- a/aom_dsp/rect.h +++ /dev/null
@@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_RECT_H_ -#define AOM_AOM_DSP_RECT_H_ - -#include "config/aom_config.h" - -#include <stdbool.h> - -// Struct representing a rectangle of pixels. -// The axes are inclusive-exclusive, ie. the point (top, left) is included -// in the rectangle but (bottom, right) is not. -typedef struct { - int left, right, top, bottom; -} PixelRect; - -static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; } - -static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; } - -static INLINE bool is_inside_rect(const int x, const int y, - const PixelRect *r) { - return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom); -} - -#endif // AOM_AOM_DSP_RECT_H_
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h deleted file mode 100644 index 6488de7..0000000 --- a/aom_dsp/simd/v128_intrinsics_arm.h +++ /dev/null
@@ -1,977 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ - -#include <arm_neon.h> - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v64_intrinsics_arm.h" - -typedef int64x2_t v128; - -SIMD_INLINE uint32_t v128_low_u32(v128 a) { - return v64_low_u32(vget_low_s64(a)); -} - -SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); } - -SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } - -SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } - -SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { - return vcombine_s64(vcreate_s64(b), vcreate_s64(a)); -} - -SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b)); -} - -SIMD_INLINE v128 v128_load_aligned(const void *p) { - return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p)); -} - -SIMD_INLINE v128 v128_load_unaligned(const void *p) { - return v128_load_aligned(p); -} - -SIMD_INLINE void v128_store_aligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE void v128_store_unaligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. 
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpretq_s64_s8( - vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c)) - : b; -#else - return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), - v64_align(v128_high_v64(b), v128_low_v64(b), c)) - : v128_from_v64( - v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), - v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); -#endif -} - -SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } - -SIMD_INLINE v128 v128_ones(void) { - return vreinterpretq_s64_u8(vdupq_n_u8(-1)); -} - -SIMD_INLINE v128 v128_dup_8(uint8_t x) { - return vreinterpretq_s64_u8(vdupq_n_u8(x)); -} - -SIMD_INLINE v128 v128_dup_16(uint16_t x) { - return vreinterpretq_s64_u16(vdupq_n_u16(x)); -} - -SIMD_INLINE v128 v128_dup_32(uint32_t x) { - return vreinterpretq_s64_u32(vdupq_n_u32(x)); -} - -SIMD_INLINE v128 v128_dup_64(uint64_t x) { - return vreinterpretq_s64_u64(vdupq_n_u64(x)); -} - -SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { - int16x8_t t1 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); - int16x8_t t2 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); -#if AOM_ARCH_AARCH64 - return vaddlvq_s16(t1) + vaddlvq_s16(t2); -#else - int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); - return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0); -#endif -} - -SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { - return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + - v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); -} - -SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { - int64x2_t t = vpaddlq_s32( - vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0); -} - 
-SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u8(vreinterpretq_u8_s64(x)); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); - return vget_lane_s32( - vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v128 v128_padd_s16(v128 a) { - return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); -} - -SIMD_INLINE v128 v128_padd_u8(v128 a) { - return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); -} - -typedef struct { - sad64_internal hi, lo; -} sad128_internal; - -SIMD_INLINE sad128_internal v128_sad_u8_init(void) { - sad128_internal s; - s.hi = s.lo = vdupq_n_u16(0); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - v128_sad_u8_sum(). - The result for more than 32 v128_sad_u8() calls is undefined. */ -SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { - sad128_internal r; - r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)), - 0); -#endif -} - -typedef struct { - ssd64_internal hi, lo; -} ssd128_internal; - -SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { - ssd128_internal s; - s.hi = s.lo = v64_ssd_u8_init(); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_u8_sum(). 
*/ -SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - ssd128_internal r; - r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { - return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); -} - -SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } - -SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } - -SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } - -SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } - -SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); -} - -SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { - return vreinterpretq_s64_u64( - vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); -} - -SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vsubq_s16(vreinterpretq_s16_s64(x), 
vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } - -SIMD_INLINE v128 v128_abs_s16(v128 x) { - return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); -} - -SIMD_INLINE v128 v128_abs_s8(v128 x) { - return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); -} - -SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { - return vreinterpretq_s64_s32( - vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { - return vreinterpretq_s64_s16( - vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_s16(vuzp2q_s16( - vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b)))), - vreinterpretq_s16_s32( - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); -#else - return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), - v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { - return vreinterpretq_s64_s32( - vmulq_s32(vreinterpretq_s32_s64(a), 
vreinterpretq_s32_s64(b))); -} - -SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b))); - int32x4_t t2 = - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); - return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); -#else - return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), - v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - int16x8_t t1 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); - int16x8_t t2 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); - return vreinterpretq_s64_s16( - vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); -#else - return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), - v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vmaxq_u8(vreinterpretq_u8_s64(x), 
vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE uint32_t v128_movemask_8(v128 a) { - a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); -#if AOM_ARCH_AARCH64 - uint8x16_t m = - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); - return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); -#else - uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); - int64x2_t s = vreinterpretq_s64_u64(m); - return v64_low_u32(v64_ziplo_8(vget_high_s64(s), vget_low_s64(s))); -#endif -} - -SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { - c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); - return v128_or(v128_and(b, c), v128_andn(a, c)); -} - -SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - 
return vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { - uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[1]); -#endif -} - 
-SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { - uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); - return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { - return v128_from_v64(vget_low_s64(a), vget_low_s64(b)); -} - -SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { - return v128_from_v64(vget_high_s64(a), vget_high_s64(b)); -} - -SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[0]); -#endif -} - 
-SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), - vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { - return 
vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); -} - -SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { - return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); -#else - uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), - vget_high_u8(vreinterpretq_u8_s64(x)) } }; - uint8x8_t shuffle_hi = - vtbl2_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern))); - uint8x8_t shuffle_lo = - vtbl2_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern))); - return v128_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle_lo), 0)); -#endif -} - -SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { 
- return vreinterpretq_s64_u16( - vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a), - vdupq_n_s8((int8_t)c))); -} - -SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a), - vdupq_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - return (c > 7) ? v128_ones() - : vreinterpretq_s64_s8(vshlq_s8(vreinterpretq_s8_s64(a), - vdupq_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a), - vdupq_n_s16((int16_t)c))); -} - -SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a), - vdupq_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { - return (c > 15) ? v128_ones() - : vreinterpretq_s64_s16(vshlq_s16(vreinterpretq_s16_s64(a), - vdupq_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { - return (c > 31) ? 
v128_zero() - : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a), - vdupq_n_s32((int32_t)c))); -} - -SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { - return (c > 31) ? v128_zero() - : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a), - vdupq_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { - return (c > 31) ? v128_ones() - : vreinterpretq_s64_s32(vshlq_s32(vreinterpretq_s32_s64(a), - vdupq_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), - vdupq_n_s64((int64_t)c))); -} - -SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), - vdupq_n_s64(-(int64_t)c))); -} - -SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { - return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-(int64_t)c)); -} - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - return n < 8 - ? v128_from_64( - (uint64_t)vorr_u64( - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - n * 8), - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - (8 - n) * 8)), - (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - n * 8)) - : (n == 8 ? v128_from_64( - (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) - : v128_from_64((uint64_t)vshl_n_u64( - vreinterpret_u64_s64(vget_low_s64(a)), - (n - 8) * 8), - 0)); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - return n == 0 - ? a - : (n < 8 - ? v128_from_64( - (uint64_t)vshr_n_u64( - vreinterpret_u64_s64(vget_high_s64(a)), n * 8), - (uint64_t)vorr_u64( - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - n * 8), - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - (8 - n) * 8))) - : (n == 8 ? 
v128_from_64(0, (uint64_t)vreinterpret_u64_s64( - vget_high_s64(a))) - : v128_from_64(0, (uint64_t)vshr_n_u64( - vreinterpret_u64_s64( - vget_high_s64(a)), - (n - 8) * 8)))); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return c ? 
vshrq_n_s64(a, c) : a; -} - -#else - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), - v64_shr_n_byte(v128_low_v64(a), 8 - n)), - v64_shl_n_byte(v128_low_v64(a), n)); - else - return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), - v64_or(v64_shr_n_byte(v128_low_v64(a), n), - v64_shl_n_byte(v128_high_v64(a), 8 - n))); - else - return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return v128_shl_8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return v128_shr_u8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return v128_shr_s8(a, c); -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return v128_shl_16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return v128_shr_u16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return v128_shr_s16(a, c); -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return v128_shl_32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return v128_shr_u32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return v128_shr_s32(a, c); -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return v128_shl_64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return v128_shr_u64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return v128_shr_s64(a, c); -} - -#endif - -typedef uint32x4_t sad128_internal_u16; - -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { - return vdupq_n_u32(0); -} - -/* Implementation dependent return value. Result must be finalised with - * v128_sad_u16_sum(). 
*/ -SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, - v128 b) { - return vaddq_u32( - s, vpaddlq_u16(vsubq_u16( - vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), - vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); -} - -SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { - uint64x2_t t = vpaddlq_u32(s); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)), - 0); -} - -typedef v128 ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_s16_sum(). */ -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, - v128 b) { - v128 d = v128_sub_16(a, b); - d = v128_madd_s16(d, d); - return v128_add_64( - s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d)))); -} - -SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { - return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); -} - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h index 32b51c9..d20f979 100644 --- a/aom_dsp/simd/v128_intrinsics_x86.h +++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -79,7 +79,7 @@ #endif #endif -SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } +SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); } SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } @@ -345,7 +345,9 @@ typedef v128 sad128_internal; -SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); } +SIMD_INLINE sad128_internal v128_sad_u8_init(void) { + return _mm_setzero_si128(); +} /* Implementation dependent return value. Result must be finalised with v128_sad_sum(). @@ -360,7 +362,7 @@ typedef int32_t ssd128_internal; -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; } +SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_sum(). */ @@ -612,7 +614,7 @@ typedef v128 sad128_internal_u16; -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_sad_u16_sum(). */ @@ -638,7 +640,7 @@ typedef v128 ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v256_intrinsics_arm.h b/aom_dsp/simd/v256_intrinsics_arm.h deleted file mode 100644 index bd86ea1..0000000 --- a/aom_dsp/simd/v256_intrinsics_arm.h +++ /dev/null
@@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ - -#include "aom_dsp/simd/v256_intrinsics_v128.h" - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h index 66cfda3..60d0d53 100644 --- a/aom_dsp/simd/v256_intrinsics_c.h +++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -95,7 +95,7 @@ c_v256_store_unaligned(p, a); } -SIMD_INLINE c_v256 c_v256_zero() { +SIMD_INLINE c_v256 c_v256_zero(void) { c_v256 t; t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; return t; @@ -176,7 +176,7 @@ typedef uint32_t c_ssd256_internal; -SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_u8_sum(). */ @@ -929,7 +929,7 @@ typedef uint32_t c_sad256_internal_u16; -SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with v256_sad_u16_sum(). */ @@ -945,7 +945,7 @@ typedef uint64_t c_ssd256_internal_s16; -SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h index 4cd83f7..493130d 100644 --- a/aom_dsp/simd/v256_intrinsics_v128.h +++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -15,20 +15,18 @@ #include "config/aom_config.h" #if HAVE_NEON -#include "aom_dsp/simd/v128_intrinsics_arm.h" -#elif HAVE_SSE2 +#error "Do not use this file for Neon" +#endif + +#if HAVE_SSE2 #include "aom_dsp/simd/v128_intrinsics_x86.h" #else #include "aom_dsp/simd/v128_intrinsics.h" #endif -#if HAVE_NEON -typedef int64x2x2_t v256; -#else typedef struct { v128 val[2]; } v256; -#endif SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } @@ -615,33 +613,6 @@ } SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { -#if HAVE_NEON -#if AOM_ARCH_AARCH64 - uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]) } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - uint8x8_t shuffle1_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1]))); - uint8x8_t shuffle1_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1]))); - uint8x8_t shuffle0_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0]))); - uint8x8_t shuffle0_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))); - return v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); -#endif -#else v128 c16 = v128_dup_8(16); v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); v128 masklo = v128_cmplt_s8(pattern.val[0], c16); @@ -650,56 +621,9 @@ v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(x.val[0], 
pattern.val[0]), masklo)); -#endif } SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { -#if HAVE_NEON -#if AOM_ARCH_AARCH64 - uint8x16x4_t p = { { - vreinterpretq_u8_s64(y.val[0]), - vreinterpretq_u8_s64(y.val[1]), - vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]), - } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - v256 c32 = v256_dup_8(32); - v256 p32 = v256_sub_8(pattern, c32); - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), - vget_high_u8(vreinterpretq_u8_s64(y.val[0])), - vget_low_u8(vreinterpretq_u8_s64(y.val[1])), - vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; - uint8x8_t shuffle1_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[1]))); - uint8x8_t shuffle1_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[1]))); - uint8x8_t shuffle0_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[0]))); - uint8x8_t shuffle0_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))); - v256 r1 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); - shuffle1_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1]))); - shuffle1_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1]))); - shuffle0_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0]))); - shuffle0_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))); - v256 r2 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - 
vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); - return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); -#endif -#else v128 c16 = v128_dup_8(16); v128 c32 = v128_dup_8(32); v128 c48 = v128_dup_8(48); @@ -720,7 +644,6 @@ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); -#endif } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h deleted file mode 100644 index f38af80..0000000 --- a/aom_dsp/simd/v64_intrinsics_arm.h +++ /dev/null
@@ -1,679 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ - -#include <arm_neon.h> -#include <string.h> - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v64_intrinsics_arm.h" -#include "aom_ports/arm.h" - -#ifdef AOM_INCOMPATIBLE_GCC -#error Incompatible gcc -#endif - -typedef int64x1_t v64; - -SIMD_INLINE uint32_t v64_low_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 0); -} - -SIMD_INLINE uint32_t v64_high_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 1); -} - -SIMD_INLINE int32_t v64_low_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 0); -} - -SIMD_INLINE int32_t v64_high_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 1); -} - -SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { - return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 | - d); -} - -SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { - return vcreate_s64((uint64_t)x << 32 | y); -} - -SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); } - -SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)vget_lane_s64(x, 0); } - -SIMD_INLINE uint32_t u32_load_aligned(const void *p) { - return *((uint32_t *)p); -} - -SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { - return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0); -} - -SIMD_INLINE void u32_store_aligned(void 
*p, uint32_t a) { - *((uint32_t *)p) = a; -} - -SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { memcpy(p, &a, 4); } - -SIMD_INLINE v64 v64_load_aligned(const void *p) { - return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p)); -} - -SIMD_INLINE v64 v64_load_unaligned(const void *p) { - return v64_load_aligned(p); -} - -SIMD_INLINE void v64_store_aligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -SIMD_INLINE void v64_store_unaligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -// The following function requires an immediate. -// Some compilers will check this if it's optimising, others wont. -SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpret_s64_s8( - vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c)) - : b; -#else - return c ? v64_from_64(((uint64_t)vget_lane_s64(b, 0) >> c * 8) | - ((uint64_t)vget_lane_s64(a, 0) << (8 - c) * 8)) - : b; -#endif -} - -SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); } - -SIMD_INLINE v64 v64_dup_8(uint8_t x) { - return vreinterpret_s64_u8(vdup_n_u8(x)); -} - -SIMD_INLINE v64 v64_dup_16(uint16_t x) { - return vreinterpret_s64_u16(vdup_n_u16(x)); -} - -SIMD_INLINE v64 v64_dup_32(uint32_t x) { - return vreinterpret_s64_u32(vdup_n_u32(x)); -} - -SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); -#if AOM_ARCH_AARCH64 - return vaddlvq_s16(t); -#else - int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); - return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0); -#endif -} - -SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vaddlvq_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -#else - int64x2_t r = - vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), 
vreinterpret_s16_s64(y))); - return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0); -#endif -} - -SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { -#if AOM_ARCH_AARCH64 - return vaddlv_u8(vreinterpret_u8_s64(x)); -#else - return vget_lane_u64( - vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))), 0); -#endif -} - -SIMD_INLINE int64_t v64_hadd_s16(v64 a) { - return vget_lane_s64(vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))), 0); -} - -typedef uint16x8_t sad64_internal; - -SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_sad_u8_sum(). -SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { - return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); -} - -SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u16(s); -#else - uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(r), vget_low_u64(r)), - 0); -#endif -} - -typedef uint32x4_t ssd64_internal; - -SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_ssd_u8_sum(). 
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { - uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); - return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); -} - -SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { -#if AOM_ARCH_AARCH64 - return vaddvq_u32(s); -#else - uint64x2_t t = vpaddlq_u32(s); - return vget_lane_u32( - vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } - -SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } - -SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } - -SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } - -SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { - return vreinterpret_s64_u32( - vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); -} - -SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vqsub_s16(vreinterpret_s16_s64(x), 
vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { - return vreinterpret_s64_s32( - vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_abs_s16(v64 x) { - return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); -} - -SIMD_INLINE v64 v64_abs_s8(v64 x) { - return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); -} - -SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - int16x8_t t = vreinterpretq_s16_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); - return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); -#else - return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); -#endif -} - -SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { - return vreinterpret_s64_s32( - vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { - int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); - return vreinterpret_s64_s32( - vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), - vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); -} - -SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), - vmovl_s8(vreinterpret_s8_s64(y))); - return 
vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); -} - -SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), 
vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u32( - vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u32( - vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { - return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { - return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { - return vreinterpret_s64_s16(vqmovn_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 
v64_pack_s32_u16(v64 x, v64 y) { - return vreinterpret_s64_u16(vqmovun_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { - return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { - return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { - return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { - return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - 
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { - return vreinterpret_s64_s32( - vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { - return vreinterpret_s64_u32( - vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - -SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { - return vreinterpret_s64_u8( - vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); -} - -SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { - return vreinterpret_s64_u8( - vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8((int8_t)c))); -} - -SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { - return vreinterpret_s64_u8( - vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { - return vreinterpret_s64_s8( - vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { - return vreinterpret_s64_u16( - vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16((int16_t)c))); -} - -SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { - return vreinterpret_s64_u16( - vshl_u16(vreinterpret_u16_s64(a), 
vdup_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { - return vreinterpret_s64_s16( - vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { - return vreinterpret_s64_u32( - vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32((int32_t)c))); -} - -SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { - return vreinterpret_s64_u32( - vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { - return vreinterpret_s64_s32( - vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int32_t)c))); -} - -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return vshl_n_s64(a, c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { - return c ? 
vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return c ? vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a; -} - -#else - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) << c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) >> c * 8); -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return v64_shr_u16(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return v64_shr_s16(a, c); -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); } - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return v64_shr_u32(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return v64_shr_s32(a, c); -} - -#endif - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c index f02c307..6cdd584 100644 --- a/aom_dsp/variance.c +++ b/aom_dsp/variance.c
@@ -10,7 +10,6 @@ */ #include <assert.h> #include <stdlib.h> -#include <string.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" @@ -70,12 +69,10 @@ // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass_c( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -100,12 +97,10 @@ // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. 
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass_c( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -129,19 +124,19 @@ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t aom_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -153,10 +148,10 @@ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - 
bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ @@ -170,10 +165,10 @@ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ \ @@ -730,24 +725,24 @@ } } -#define MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ - invert_mask); \ - return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t 
*msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } MASK_SUBPIX_VAR(4, 4) @@ -924,19 +919,19 @@ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#define OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ } OBMC_VAR(4, 4)
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c deleted file mode 100644 index b08ec25..0000000 --- a/aom_dsp/x86/aom_asm_stubs.c +++ /dev/null
@@ -1,95 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction aom_filter_block1d16_v8_sse2; -filter8_1dfunction aom_filter_block1d16_h8_sse2; -filter8_1dfunction aom_filter_block1d8_v8_sse2; -filter8_1dfunction aom_filter_block1d8_h8_sse2; -filter8_1dfunction aom_filter_block1d4_v8_sse2; -filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v4_sse2; -filter8_1dfunction aom_filter_block1d16_h4_sse2; - -filter8_1dfunction aom_filter_block1d8_h4_sse2; -filter8_1dfunction aom_filter_block1d8_v4_sse2; -filter8_1dfunction aom_filter_block1d4_h4_sse2; -filter8_1dfunction aom_filter_block1d4_v4_sse2; - -filter8_1dfunction aom_filter_block1d16_v2_sse2; -filter8_1dfunction aom_filter_block1d16_h2_sse2; -filter8_1dfunction aom_filter_block1d8_v2_sse2; -filter8_1dfunction aom_filter_block1d8_h2_sse2; -filter8_1dfunction aom_filter_block1d4_v2_sse2; -filter8_1dfunction aom_filter_block1d4_h2_sse2; - -// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, 
-// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) - -#if CONFIG_AV1_HIGHBITDEPTH -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; - -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; - -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; - -// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) -#endif -#endif // 
HAVE_SSE2
diff --git a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm index d392225..f84f8fa 100644 --- a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -202,14 +202,15 @@ SECTION .text -;void aom_filter_block1d4_v8_sse2 +;void aom_highbd_filter_block1d4_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d4_v8_sse2) sym(aom_highbd_filter_block1d4_v8_sse2): @@ -272,14 +273,15 @@ pop rbp ret -;void aom_filter_block1d8_v8_sse2 +;void aom_highbd_filter_block1d8_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d8_v8_sse2) sym(aom_highbd_filter_block1d8_v8_sse2): @@ -331,14 +333,15 @@ pop rbp ret -;void aom_filter_block1d16_v8_sse2 +;void aom_highbd_filter_block1d16_v8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d16_v8_sse2) sym(aom_highbd_filter_block1d16_v8_sse2): @@ -394,14 +397,15 @@ pop rbp ret -;void aom_filter_block1d4_h8_sse2 +;void aom_highbd_filter_block1d4_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t 
out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d4_h8_sse2) sym(aom_highbd_filter_block1d4_h8_sse2): @@ -469,14 +473,15 @@ pop rbp ret -;void aom_filter_block1d8_h8_sse2 +;void aom_highbd_filter_block1d8_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d8_h8_sse2) sym(aom_highbd_filter_block1d8_h8_sse2): @@ -535,14 +540,15 @@ pop rbp ret -;void aom_filter_block1d16_h8_sse2 +;void aom_highbd_filter_block1d16_h8_sse2 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter +; const uint16_t *src_ptr, +; const ptrdiff_t src_pitch, +; uint16_t *output_ptr, +; ptrdiff_t out_pitch, +; unsigned int output_height, +; const int16_t *filter, +; int bd ;) globalsym(aom_highbd_filter_block1d16_h8_sse2) sym(aom_highbd_filter_block1d16_h8_sse2):
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c deleted file mode 100644 index 5c36b68..0000000 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c +++ /dev/null
@@ -1,569 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_dsp_rtcd.h" -#include "aom_dsp/x86/convolve.h" -#include "aom_ports/mem.h" - -void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, - srcRegFilt32b2_2; - __m128i srcReg32b1, srcReg32b2; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_1_1, 
secondFilters); - __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); - __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // reading stride of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); - - ss_2 = _mm_srli_si128(srcReg32b2, 2); - ss_4 = _mm_srli_si128(srcReg32b2, 4); - ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_1, secondFilters); - d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); - - ss_1 = _mm_srli_si128(srcReg32b2, 3); - ss_3 = _mm_srli_si128(srcReg32b2, 5); - ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); - - res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 
8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_pixels_per_line; - - _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; - __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; - __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = 
_mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); - __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); - __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i 
resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); - resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); - resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); - __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); - resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); - __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); - resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); - resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); - resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - 
resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); - resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); - - src_ptr += src_stride; - - _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); - _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg23_hi_1 = resReg45_hi_1; - resReg23_hi_2 = resReg45_hi_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - resReg34_hi_1 = resReg56_hi_1; - resReg34_hi_2 = resReg56_hi_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - 
ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_3, secondFilters); - d2 = _mm_madd_epi16(ss_5, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg34_lo; - __m128i srcReg45_lo, srcReg56_lo; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_45_lo, resReg34_56_lo; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = 
_mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i 
resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); - - src_ptr += src_stride; - - _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); - _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1; - __m128i 
srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - - __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); - __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); - - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t 
src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23, srcReg34, srcReg45, srcReg56; - __m128i resReg23_34, resReg45_56; - __m128i resReg23_34_45_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); - - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - tmp_0 = _mm_madd_epi16(resReg23, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34, secondFilters); - resReg23_34 = 
_mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); - __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); - - tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); - resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); - - // shift by 6 bit each 16 bit - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); - resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_34_45_56 = - _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); - - src_ptr += src_stride; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); - *((int *)(output_ptr + out_pitch)) = - _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23 = resReg45; - resReg34 = resReg56; - srcReg4 = srcReg6; - } -}
diff --git a/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_subpixel_8t_sse2.asm deleted file mode 100644 index 640c5b2..0000000 --- a/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ /dev/null
@@ -1,615 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. - -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - 
packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - 
pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_v8_sse2) -sym(aom_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_v8_sse2) -sym(aom_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 
5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_v8_sse2) -sym(aom_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_h8_sse2) -sym(aom_filter_block1d4_h8_sse2): - push rbp - mov rbp, 
rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_h8_sse2) -sym(aom_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa 
xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_h8_sse2) -sym(aom_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + 
rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret
diff --git a/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm deleted file mode 100644 index 90dd55a..0000000 --- a/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ /dev/null
@@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - 
movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -globalsym(aom_filter_block1d4_v2_sse2) -sym(aom_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_v2_sse2) -sym(aom_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_v2_sse2) -sym(aom_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 
- - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d4_h2_sse2) -sym(aom_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_h2_sse2) -sym(aom_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_h2_sse2) -sym(aom_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c index 9ab9143..7ff2801 100644 --- a/aom_dsp/x86/avg_intrin_sse2.c +++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/bitdepth_conversion_sse2.h" #include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, @@ -133,7 +134,7 @@ return (avg + 32) >> 6; } -void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { +static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { __m128i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm_setzero_si128(); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); @@ -171,10 +172,8 @@ __m128i s0, s1, u0; unsigned int avg = 0; u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)), - _mm_cvtsi32_si128(*(const int *)(s + p))); - s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)), - _mm_cvtsi32_si128(*(const int *)(s + p * 3))); + s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p)); + s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3)); s0 = _mm_sad_epu8(s0, u0); s1 = _mm_sad_epu8(s1, u0); s0 = _mm_add_epi16(s0, s1);
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c index 58a7345..9a10e86 100644 --- a/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1120,14 +1120,12 @@ const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { // Load 4 pixels from each of 4 rows from each source - const __m128i s0a = - _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride)); - const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride), - *(int64_t *)(src0 + 3 * src0_stride)); - const __m128i s1a = - _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride)); - const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride), - *(int64_t *)(src1 + 3 * src1_stride)); + const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride); + const __m128i s0b = + xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride); + const __m128i s1b = + xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride); // Generate the inverse masks const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h index b4ff697..4ca214f 100644 --- a/aom_dsp/x86/convolve.h +++ b/aom_dsp/x86/convolve.h
@@ -14,6 +14,7 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h"
diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h index 7ee8ba3..e1db3b9 100644 --- a/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -30,6 +30,7 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +#if defined(FDCT4x4_2D_HELPER) static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, __m128i *in1) { // Constants @@ -185,7 +186,9 @@ } } } +#endif // defined(FDCT4x4_2D_HELPER) +#if defined(FDCT4x4_2D) void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -205,13 +208,16 @@ storeu_output(&in0, output + 0 * 4); storeu_output(&in1, output + 2 * 4); } +#endif // defined(FDCT4x4_2D) +#if defined(FDCT4x4_2D_LP) void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } +#endif // defined(FDCT4x4_2D_LP) #if CONFIG_INTERNAL_STATS void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c index 8361e2f..11e4577 100644 --- a/aom_dsp/x86/highbd_convolve_avx2.c +++ b/aom_dsp/x86/highbd_convolve_avx2.c
@@ -11,7 +11,7 @@ #include <immintrin.h> #include <string.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve.h" #include "aom_dsp/x86/convolve_avx2.h"
diff --git a/aom_dsp/x86/highbd_convolve_sse2.c b/aom_dsp/x86/highbd_convolve_sse2.c index a2bb283..40201aa 100644 --- a/aom_dsp/x86/highbd_convolve_sse2.c +++ b/aom_dsp/x86/highbd_convolve_sse2.c
@@ -15,10 +15,9 @@ // ----------------------------------------------------------------------------- -void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg34_lo; @@ -101,10 +100,9 @@ } } -void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; @@ -153,10 +151,9 @@ } } -void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; @@ -262,10 +259,9 @@ } } -void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i 
filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; @@ -330,22 +326,57 @@ } } -void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } -void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } + +// From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +// From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction 
aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c index 21389db..31c3c31 100644 --- a/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -12,7 +12,7 @@ #include <tmmintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_common_intrin.h"
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c index a5c450a..3b0c42c 100644 --- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -14,6 +14,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "config/aom_dsp_rtcd.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c index 6c78eee..8b3045a 100644 --- a/aom_dsp/x86/highbd_sad_avx2.c +++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -551,7 +551,7 @@ static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; - const __m256i mask = yy_set1_64_from_32i(~0); + const __m256i mask = _mm256_set1_epi64x(~0u); __m128i sad; // 8 32-bit summation
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c index b4ff91d..21e9e8b 100644 --- a/aom_dsp/x86/highbd_variance_avx2.c +++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -618,9 +618,9 @@ return (var > 0) ? var : 0; } -void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); for (int i = 0; i < 8; i += 2) { @@ -653,9 +653,9 @@ *sse = _mm_extract_epi32(v_d, 1); } -void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); @@ -703,19 +703,19 @@ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_avx2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + highbd_calc##block_size##x##block_size##var_avx2, \ + block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) @@ -741,6 +741,17 @@ #undef VAR_FN +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + highbd_calc16x16var_avx2, 16); + return *sse; +} + #define SSE2_HEIGHT(H) \ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ @@ -749,7 +760,7 @@ SSE2_HEIGHT(8) SSE2_HEIGHT(16) -#undef SSE2_Height +#undef SSE2_HEIGHT #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ @@ -782,8 +793,8 @@ #undef HIGHBD_SUBPIX_VAR -uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; __m256i src0_8x16, src1_8x16, src_16x16; @@ -840,8 +851,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t 
sum = 0; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; @@ -897,8 +908,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } }
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c index e897aab..2fc2e1c 100644 --- a/aom_dsp/x86/highbd_variance_sse2.c +++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -637,8 +637,8 @@ } } -uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16; __m128i src_8x16; @@ -682,8 +682,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i src_8x16; __m128i dst_8x16; @@ -728,8 +728,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } }
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c index 621ef7a..242a548 100644 --- a/aom_dsp/x86/intrapred_avx2.c +++ b/aom_dsp/x86/intrapred_avx2.c
@@ -11,7 +11,7 @@ #include <immintrin.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h"
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c index fb30420..9de8bf3 100644 --- a/aom_dsp/x86/intrapred_sse4.c +++ b/aom_dsp/x86/intrapred_sse4.c
@@ -12,7 +12,7 @@ #include <emmintrin.h> // SSE2 #include <smmintrin.h> /* SSE4.1 */ -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h"
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c index fd48260..869f880 100644 --- a/aom_dsp/x86/intrapred_ssse3.c +++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -940,10 +940,10 @@ return _mm_unpacklo_epi16((x), _mm_setzero_si128()); } -void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, - const uint8_t *LIBAOM_RESTRICT top_row, - const uint8_t *LIBAOM_RESTRICT left_column, int width, - int height) { +static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column, + int width, int height) { const uint8_t *const sm_weights_h = smooth_weights + height - 4; const uint8_t *const sm_weights_w = smooth_weights + width - 4; const __m128i zero = _mm_setzero_si128();
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c index dd798ca..ed5b580 100644 --- a/aom_dsp/x86/jnt_variance_ssse3.c +++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -17,16 +17,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" - -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); +#include "aom_dsp/x86/variance_impl_ssse3.h" static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r,
diff --git a/aom_dsp/x86/masked_sad4d_ssse3.c b/aom_dsp/x86/masked_sad4d_ssse3.c index 799ce9e..d96a9dd 100644 --- a/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -103,11 +103,12 @@ pred = _mm_packus_epi16(pred_l, pred_r); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -164,11 +165,12 @@ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -224,22 +226,22 @@ msk_stride, m, n, inv_mask, sad_array); \ } -#define MASKSAD8XN_SSSE3(n) \ - void aom_masked_sad8x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 8, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD8XN_SSSE3(n) \ + void 
aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \ + msk, msk_stride, n, inv_mask, sad_array); \ } -#define MASKSAD4XN_SSSE3(n) \ - void aom_masked_sad4x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 4, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \ + msk, msk_stride, n, inv_mask, sad_array); \ } MASKSADMXN_SSSE3(128, 128)
diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c index 2c02255..f3751c7 100644 --- a/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <tmmintrin.h> +#include <immintrin.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h"
diff --git a/aom_dsp/x86/obmc_intrinsic_sse4.h b/aom_dsp/x86/obmc_intrinsic_sse4.h index 210f466..fbed235 100644 --- a/aom_dsp/x86/obmc_intrinsic_sse4.h +++ b/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -15,6 +15,7 @@ #include <smmintrin.h> #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, @@ -28,7 +29,7 @@ assert(IS_POWER_OF_TWO(h)); do { - const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n)); + const __m128i v_p_b = xx_loadl_32(pre + n); const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c index 2aa2a05..9d1b7d4 100644 --- a/aom_dsp/x86/obmc_sad_avx2.c +++ b/aom_dsp/x86/obmc_sad_avx2.c
@@ -13,6 +13,7 @@ #include <immintrin.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c index 0338a8c..542572c 100644 --- a/aom_dsp/x86/obmc_sad_sse4.c +++ b/aom_dsp/x86/obmc_sad_sse4.c
@@ -13,6 +13,7 @@ #include <immintrin.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_variance_avx2.c b/aom_dsp/x86/obmc_variance_avx2.c index b2df8a9..c23d8c4 100644 --- a/aom_dsp/x86/obmc_variance_avx2.c +++ b/aom_dsp/x86/obmc_variance_avx2.c
@@ -13,6 +13,7 @@ #include <immintrin.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c index d3f5f52..164d0c2 100644 --- a/aom_dsp/x86/obmc_variance_sse4.c +++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -13,6 +13,7 @@ #include <immintrin.h> #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" @@ -21,21 +22,12 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/variance_impl_ssse3.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum,
diff --git a/aom_dsp/x86/subpel_variance_sse2.asm b/aom_dsp/x86/subpel_variance_ssse3.asm similarity index 97% rename from aom_dsp/x86/subpel_variance_sse2.asm rename to aom_dsp/x86/subpel_variance_ssse3.asm index d1d8373..f424ce0 100644 --- a/aom_dsp/x86/subpel_variance_sse2.asm +++ b/aom_dsp/x86/subpel_variance_ssse3.asm
@@ -15,21 +15,6 @@ SECTION_RODATA pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 bilin_filter_m_ssse3: times 8 db 16, 0 times 8 db 14, 2 @@ -109,9 +94,6 @@ %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 %define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 %endif ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses ; 11, not 13, if the registers are ordered correctly. May make a minor speed @@ -1449,21 +1431,11 @@ ; location in the sse/2 version, rather than duplicating that code in the ; binary. -INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - INIT_XMM ssse3 SUBPEL_VARIANCE 4 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - INIT_XMM ssse3 SUBPEL_VARIANCE 4, 1 SUBPEL_VARIANCE 8, 1
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c index 89b9b82..c748a7d 100644 --- a/aom_dsp/x86/sum_squares_avx2.c +++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -21,7 +21,7 @@ int width, int height) { uint64_t result; __m256i v_acc_q = _mm256_setzero_si256(); - const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0); + const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u); for (int col = 0; col < height; col += 4) { __m256i v_acc_d = _mm256_setzero_si256(); for (int row = 0; row < width; row += 16) {
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c index cf3ed98..6c34c44 100644 --- a/aom_dsp/x86/sum_squares_sse2.c +++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -84,7 +84,7 @@ src += stride << 2; r += 4; } while (r < height); - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), _mm_and_si128(v_acc_q, v_zext_mask_q)); v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); @@ -116,7 +116,7 @@ int height) { int r = 0; - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_q = _mm_setzero_si128(); do { @@ -254,7 +254,7 @@ ////////////////////////////////////////////////////////////////////////////// static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { - const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128();
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index 6744ec5..ae889ad 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h
@@ -12,7 +12,7 @@ #ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ #define AOM_AOM_DSP_X86_SYNONYMS_H_ -#include <immintrin.h> +#include <emmintrin.h> #include <string.h> #include "config/aom_config.h" @@ -46,6 +46,14 @@ return _mm_loadu_si128((const __m128i *)a); } +// Load 64 bits from each of hi and low, and pack into an SSE register +// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate +// the strict aliasing rule, this takes a different approach +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), + _mm_loadl_epi64((const __m128i *)hi)); +} + static INLINE void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); @@ -63,28 +71,6 @@ _mm_storeu_si128((__m128i *)a, v); } -// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set_epi64x() -// acting on 32-bit integers. -static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, e1, 0, e0); -#else - return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); -#endif -} - -// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set1_epi64x() -// acting on a 32-bit integer. -static INLINE __m128i xx_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, a, 0, a); -#else - return _mm_set1_epi64x((uint32_t)a); -#endif -} - // Fill an SSE register using an interleaved pair of values, ie. set the // 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering // as when a register is stored to / loaded from memory.
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h index b729e5f..d78f4e6 100644 --- a/aom_dsp/x86/synonyms_avx2.h +++ b/aom_dsp/x86/synonyms_avx2.h
@@ -43,15 +43,14 @@ _mm256_storeu_si256((__m256i *)a, v); } -// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm256_set1_epi64x() -// acting on a 32-bit integer. -static INLINE __m256i yy_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); -#else - return _mm256_set1_epi64x((uint32_t)a); -#endif +// Fill an AVX register using an interleaved pair of values, ie. set the +// 16 channels to {a, b} repeated 8 times, using the same channel ordering +// as when a register is stored to / loaded from memory. +// +// This is useful for rearranging filter kernels for use with the _mm256_madd_epi16 +// instruction +static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { + return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); } // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c index 046d6f1..0f872fc 100644 --- a/aom_dsp/x86/variance_avx2.c +++ b/aom_dsp/x86/variance_avx2.c
@@ -518,8 +518,8 @@ } } -uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; @@ -575,8 +575,9 @@ // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; @@ -665,8 +666,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst3_16x8; __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; @@ -715,8 +716,9 @@ // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. 
-uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8; __m256i dst0_16x16, dst1_16x16; @@ -780,8 +782,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } @@ -795,8 +797,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); - case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); + case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); + case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); default: assert(0 && "unsupported width"); return -1; } }
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c index 9e9e70e..57a1cee 100644 --- a/aom_dsp/x86/variance_impl_avx2.c +++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -648,7 +648,7 @@ #endif #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ - int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + static int sub_pixel_avg_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \ unsigned int *sse) { \ @@ -876,7 +876,7 @@ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse, \ const uint8_t *sec_ptr) { \ - const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ @@ -899,7 +899,7 @@ const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ + const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, &sse2); \ dst_ptr += hf * dst_stride; \
diff --git a/aom_dsp/x86/variance_impl_ssse3.c b/aom_dsp/x86/variance_impl_ssse3.c index 6990021..952cca1 100644 --- a/aom_dsp/x86/variance_impl_ssse3.c +++ b/aom_dsp/x86/variance_impl_ssse3.c
@@ -15,6 +15,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/variance_impl_ssse3.h" void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
diff --git a/aom_dsp/x86/variance_impl_ssse3.h b/aom_dsp/x86/variance_impl_ssse3.h new file mode 100644 index 0000000..725b551 --- /dev/null +++ b/aom_dsp/x86/variance_impl_ssse3.h
@@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ +#define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ + +#include <stdint.h> + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +#endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c index faec9cf..610695a 100644 --- a/aom_dsp/x86/variance_sse2.c +++ b/aom_dsp/x86/variance_sse2.c
@@ -403,200 +403,6 @@ return *sse; } -// The 2 unused parameters are place holders for PIC enabled build. -// These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int aom_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -DECLS(sse2); -DECLS(ssse3); -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ - &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#if !CONFIG_REALTIME_ONLY -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, 
opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) -#else -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) -#endif - -FNS(sse2) -FNS(ssse3) - -#undef FNS -#undef FN - -// The 2 unused parameters are place holders for PIC enabled build. 
-#define DECL(w, opt) \ - int aom_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -DECLS(sse2); -DECLS(ssse3); -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ - const uint8_t *sec) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - const uint8_t *sec_ptr = sec; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ - sec_ptr, w, hf, &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - sec_ptr += hf * w; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - sec += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#if !CONFIG_REALTIME_ONLY -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, 
(int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) -#else -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) -#endif - -FNS(sse2) -FNS(ssse3) - -#undef FNS -#undef FN - static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, const __m128i s1, const __m128i a) { @@ -710,8 +516,8 @@ } } -uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst_16x8; __m128i 
src0_16x4, src1_16x4, src_16x8; @@ -744,8 +550,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst_8x8, dst_16x8; __m128i src_16x8; @@ -781,8 +587,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } }
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c new file mode 100644 index 0000000..d616f43 --- /dev/null +++ b/aom_dsp/x86/variance_ssse3.c
@@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stddef.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += 
se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + 
FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) +#endif + +FNS(ssse3) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ 
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) +#endif + +FNS(ssse3) + +#undef FNS +#undef FN
diff --git a/aom_ports/aarch32_cpudetect.c b/aom_ports/aarch32_cpudetect.c index 753f957..809bae5 100644 --- a/aom_ports/aarch32_cpudetect.c +++ b/aom_ports/aarch32_cpudetect.c
@@ -44,7 +44,7 @@ return flags; } -#elif defined(ANDROID_USE_CPU_FEATURES_LIB) +#elif defined(AOM_USE_ANDROID_CPU_FEATURES) static int arm_get_cpu_caps(void) { int flags = 0;
diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c index 13299a6..e356763 100644 --- a/aom_ports/aarch64_cpudetect.c +++ b/aom_ports/aarch64_cpudetect.c
@@ -9,8 +9,12 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include "config/aom_config.h" + #include "arm_cpudetect.h" +#include "aom_ports/arm.h" + #if defined(__APPLE__) #include <sys/sysctl.h> #endif @@ -85,7 +89,7 @@ return flags; } -#elif defined(ANDROID_USE_CPU_FEATURES_LIB) +#elif defined(AOM_USE_ANDROID_CPU_FEATURES) static int arm_get_cpu_caps(void) { int flags = 0; @@ -104,6 +108,7 @@ #define AOM_AARCH64_HWCAP_CRC32 (1 << 7) #define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20) #define AOM_AARCH64_HWCAP_SVE (1 << 22) +#define AOM_AARCH64_HWCAP2_SVE2 (1 << 1) #define AOM_AARCH64_HWCAP2_I8MM (1 << 13) static int arm_get_cpu_caps(void) { @@ -111,7 +116,7 @@ #if HAVE_ARM_CRC32 || HAVE_NEON_DOTPROD || HAVE_SVE unsigned long hwcap = getauxval(AT_HWCAP); #endif -#if HAVE_NEON_I8MM +#if HAVE_NEON_I8MM || HAVE_SVE2 unsigned long hwcap2 = getauxval(AT_HWCAP2); #endif @@ -130,6 +135,9 @@ #if HAVE_SVE if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE; #endif // HAVE_SVE +#if HAVE_SVE2 + if (hwcap2 & AOM_AARCH64_HWCAP2_SVE2) flags |= HAS_SVE2; +#endif // HAVE_SVE2 return flags; } @@ -189,5 +197,8 @@ if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE; if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE; + // Restrict flags: SVE2 assumes that FEAT_SVE is available. + if (!(flags & HAS_SVE)) flags &= ~HAS_SVE2; + return flags; }
diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake index 8fd2ffd..6df2bf0 100644 --- a/aom_ports/aom_ports.cmake +++ b/aom_ports/aom_ports.cmake
@@ -18,7 +18,7 @@ "${AOM_ROOT}/aom_ports/emmintrin_compat.h" "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h" "${AOM_ROOT}/aom_ports/mem_ops_aligned.h" - "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h") + "${AOM_ROOT}/aom_ports/sanitizer.h") list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
diff --git a/aom_ports/arm.h b/aom_ports/arm.h index 853741d..a575108 100644 --- a/aom_ports/arm.h +++ b/aom_ports/arm.h
@@ -29,6 +29,8 @@ #define HAS_NEON_I8MM (1 << 3) // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. #define HAS_SVE (1 << 4) +// Armv9.0-A SVE2 instructions. +#define HAS_SVE2 (1 << 5) int aom_arm_cpu_caps(void);
diff --git a/aom_ports/arm_cpudetect.h b/aom_ports/arm_cpudetect.h index 33c2d1b..2b63942 100644 --- a/aom_ports/arm_cpudetect.h +++ b/aom_ports/arm_cpudetect.h
@@ -32,7 +32,7 @@ #endif #if defined(__ANDROID__) && (__ANDROID_API__ < 18) -#define ANDROID_USE_CPU_FEATURES_LIB 1 +#define AOM_USE_ANDROID_CPU_FEATURES 1 // Use getauxval() when targeting (64-bit) Android with API level >= 18. // getauxval() is supported since Android API level 18 (Android 4.3.) // First Android version with 64-bit support was Android 5.x (API level 21).
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h index 7f4c165..7db4cde 100644 --- a/aom_ports/bitops.h +++ b/aom_ports/bitops.h
@@ -13,12 +13,12 @@ #define AOM_AOM_PORTS_BITOPS_H_ #include <assert.h> +#include <stdint.h> -#include "aom_ports/msvc.h" #include "config/aom_config.h" #ifdef _MSC_VER -#if defined(_M_X64) || defined(_M_IX86) +#if defined(_M_X64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM) #include <intrin.h> #define USE_MSC_INTRINSICS #endif @@ -52,7 +52,6 @@ _BitScanReverse(&first_set_bit, n); return first_set_bit; } -#undef USE_MSC_INTRINSICS #else static INLINE int get_msb(unsigned int n) { int log = 0; @@ -71,6 +70,50 @@ } #endif +#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int aom_clzll(uint64_t n) { return __builtin_clzll(n); } +#elif defined(USE_MSC_INTRINSICS) +#if defined(_M_X64) || defined(_M_ARM64) +#pragma intrinsic(_BitScanReverse64) +#endif + +static INLINE int aom_clzll(uint64_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) +#if defined(_M_X64) || defined(_M_ARM64) + const unsigned char bit_set = + _BitScanReverse64(&first_set_bit, (unsigned __int64)n); +#else // !(defined(_M_X64) || defined(_M_ARM64)) + const unsigned long n_hi = (unsigned long)(n >> 32); // NOLINT(runtime/int) + if (n_hi != 0) { + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); + assert(bit_set != 0); + (void)bit_set; + return 31 ^ (int)first_set_bit; + } + const unsigned char bit_set = + _BitScanReverse(&first_set_bit, (unsigned long)n); // NOLINT(runtime/int) +#endif + assert(bit_set != 0); + (void)bit_set; + return 63 ^ (int)first_set_bit; +} +#undef USE_MSC_INTRINSICS +#else +static INLINE int aom_clzll(uint64_t n) { + assert(n != 0); + + int res = 0; + uint64_t high_bit = 1ULL << 63; + while (!(n & high_bit)) { + res++; + n <<= 1; + } + return res; +} +#endif + #ifdef __cplusplus } // extern "C" #endif
diff --git a/aom_ports/mem.h b/aom_ports/mem.h index a70ce82..7718006 100644 --- a/aom_ports/mem.h +++ b/aom_ports/mem.h
@@ -24,7 +24,13 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -#if HAVE_NEON && defined(_MSC_VER) +#if defined(__has_builtin) +#define AOM_HAS_BUILTIN(x) __has_builtin(x) +#else +#define AOM_HAS_BUILTIN(x) 0 +#endif + +#if !AOM_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif
diff --git a/aom_ports/msvc.h b/aom_ports/msvc.h deleted file mode 100644 index e78e605..0000000 --- a/aom_ports/msvc.h +++ /dev/null
@@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_PORTS_MSVC_H_ -#define AOM_AOM_PORTS_MSVC_H_ -#ifdef _MSC_VER - -#include "config/aom_config.h" - -#if _MSC_VER < 1900 // VS2015 provides snprintf -#define snprintf _snprintf -#endif // _MSC_VER < 1900 - -#if _MSC_VER < 1800 // VS2013 provides round -#include <math.h> -static INLINE double round(double x) { - if (x < 0) - return ceil(x - 0.5); - else - return floor(x + 0.5); -} - -static INLINE float roundf(float x) { - if (x < 0) - return (float)ceil(x - 0.5f); - else - return (float)floor(x + 0.5f); -} - -static INLINE long lroundf(float x) { - if (x < 0) - return (long)(x - 0.5f); - else - return (long)(x + 0.5f); -} -#endif // _MSC_VER < 1800 - -#if HAVE_AVX -#include <immintrin.h> -// Note: -// _mm256_insert_epi16 intrinsics is available from vs2017. -// We define this macro for vs2015 and earlier. 
The -// intrinsics used here are in vs2015 document: -// https://msdn.microsoft.com/en-us/library/hh977022.aspx -// Input parameters: -// a: __m256i, -// d: int16_t, -// indx: imm8 (0 - 15) -#if _MSC_VER <= 1900 -#define _mm256_insert_epi16(a, d, indx) \ - _mm256_insertf128_si256( \ - a, \ - _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \ - indx >> 3) - -static INLINE int _mm256_extract_epi32(__m256i a, const int i) { - return a.m256i_i32[i & 7]; -} -static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) { - __m256i c = a; - c.m256i_i32[i & 7] = b; - return c; -} -#endif // _MSC_VER <= 1900 -#endif // HAVE_AVX -#endif // _MSC_VER -#endif // AOM_AOM_PORTS_MSVC_H_
diff --git a/aom_scale/aom_scale_rtcd.c b/aom_scale/aom_scale_rtcd.c index a04e053..93def35 100644 --- a/aom_scale/aom_scale_rtcd.c +++ b/aom_scale/aom_scale_rtcd.c
@@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_scale_rtcd() { aom_once(setup_rtcd_internal); } +void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); }
diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl index ae0a856..0d545c2 100644 --- a/aom_scale/aom_scale_rtcd.pl +++ b/aom_scale/aom_scale_rtcd.pl
@@ -10,6 +10,8 @@ ## sub aom_scale_forward_decls() { print <<EOF +#include <stdbool.h> + struct yv12_buffer_config; EOF } @@ -26,17 +28,17 @@ add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; } -add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes"; +add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes"; add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes"; -add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; +add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop"; -add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc"; +add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; -add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc"; +add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2"; add_proto qw/void aom_yv12_partial_coloc_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend"; @@ -47,7 +49,7 @@ add_proto 
qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end"; -add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; +add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, int num_planes"; add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c index 94b400b..ed35bb1 100644 --- a/aom_scale/generic/yv12config.c +++ b/aom_scale/generic/yv12config.c
@@ -11,9 +11,12 @@ #include <assert.h> +#include "config/aom_config.h" + +#include "aom/aom_image.h" #include "aom/internal/aom_image_internal.h" -#include "aom_dsp/pyramid.h" #include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" @@ -60,7 +63,7 @@ const uint64_t uvplane_size, const int aligned_width, const int aligned_height, const int uv_width, const int uv_height, const int uv_stride, const int uv_border_w, const int uv_border_h, - int num_pyramid_levels, int alloc_y_plane_only) { + bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const uint64_t frame_size = @@ -71,8 +74,8 @@ #if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER // We should only need an 8-bit version of the source frame if we are // encoding in non-realtime mode - (void)num_pyramid_levels; - assert(num_pyramid_levels == 0); + (void)alloc_pyramid; + assert(!alloc_pyramid); #endif // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER #if defined AOM_MAX_ALLOCABLE_MEMORY @@ -80,9 +83,8 @@ uint64_t alloc_size = frame_size; #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY // The size of ybf->y_pyramid - if (num_pyramid_levels > 0) { - alloc_size += aom_get_pyramid_alloc_size( - width, height, num_pyramid_levels, use_highbitdepth); + if (alloc_pyramid) { + alloc_size += aom_get_pyramid_alloc_size(width, height, use_highbitdepth); alloc_size += av1_get_corner_list_size(); } #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY @@ -190,9 +192,8 @@ av1_free_corner_list(ybf->corners); ybf->corners = NULL; } - if (num_pyramid_levels > 0) { - ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels, - use_highbitdepth); + if (alloc_pyramid) { + ybf->y_pyramid = aom_alloc_pyramid(width, height, use_highbitdepth); if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR; ybf->corners = av1_alloc_corner_list(); if 
(!ybf->corners) return AOM_CODEC_MEM_ERROR; @@ -237,7 +238,7 @@ int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, - int num_pyramid_levels, int alloc_y_plane_only) { + bool alloc_pyramid, int alloc_y_plane_only) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return AOM_CODEC_MEM_ERROR; @@ -264,21 +265,20 @@ ybf, width, height, ss_x, ss_y, use_highbitdepth, border, byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, aligned_width, aligned_height, uv_width, uv_height, uv_stride, - uv_border_w, uv_border_h, num_pyramid_levels, alloc_y_plane_only); + uv_border_w, uv_border_h, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; } int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, - int byte_alignment, int num_pyramid_levels, + int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { aom_free_frame_buffer(ybf); - return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, - use_highbitdepth, border, byte_alignment, - NULL, NULL, NULL, num_pyramid_levels, - alloc_y_plane_only); + return aom_realloc_frame_buffer( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, NULL, NULL, NULL, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; }
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c index 5546112..384b72c 100644 --- a/aom_scale/generic/yv12extend.c +++ b/aom_scale/generic/yv12extend.c
@@ -302,8 +302,10 @@ } void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) { + YV12_BUFFER_CONFIG *dst_ybc, int use_crop) { int row; + int width = use_crop ? src_ybc->y_crop_width : src_ybc->y_width; + int height = use_crop ? src_ybc->y_crop_height : src_ybc->y_height; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; @@ -311,8 +313,8 @@ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_ybc->y_height; ++row) { - memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_ybc->y_stride; dst16 += dst_ybc->y_stride; } @@ -320,56 +322,60 @@ } #endif - for (row = 0; row < src_ybc->y_height; ++row) { - memcpy(dst, src, src_ybc->y_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_ybc->y_stride; dst += dst_ybc->y_stride; } } void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, - YV12_BUFFER_CONFIG *dst_bc) { + YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; + int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; + int height = use_crop ? 
src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->u_buffer; uint8_t *dst = dst_bc->u_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst, src, src_bc->uv_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, - YV12_BUFFER_CONFIG *dst_bc) { + YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; + int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; + int height = use_crop ? 
src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->v_buffer; uint8_t *dst = dst_bc->v_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst, src, src_bc->uv_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } @@ -491,8 +497,8 @@ } int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, - int byte_alignment, - int num_pyramid_levels, int num_planes) { + int byte_alignment, bool alloc_pyramid, + int num_planes) { if (ybf) { if (new_border == ybf->border) return 0; YV12_BUFFER_CONFIG new_buf; @@ -500,7 +506,7 @@ const int error = aom_alloc_frame_buffer( &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, - byte_alignment, num_pyramid_levels, 0); + byte_alignment, alloc_pyramid, 0); if (error) return error; // Copy image buffer aom_yv12_copy_frame(ybf, &new_buf, num_planes);
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h index f192a30..bc05de2 100644 --- a/aom_scale/yv12config.h +++ b/aom_scale/yv12config.h
@@ -16,6 +16,8 @@ extern "C" { #endif +#include <stdbool.h> + #include "config/aom_config.h" #include "aom/aom_codec.h" @@ -45,18 +47,29 @@ /*!\cond */ union { struct { + // The aligned frame width of luma. + // It is aligned to a multiple of 8: + // y_width = (y_crop_width + 7) & ~7 int y_width; + // The aligned frame width of chroma. + // uv_width = y_width >> subsampling_x int uv_width; }; int widths[2]; }; union { struct { + // The aligned frame height of luma. + // It is aligned to a multiple of 8: + // y_height = (y_crop_height + 7) & ~7 int y_height; + // The aligned frame height of chroma. + // uv_height = y_height >> subsampling_y int uv_height; }; int heights[2]; }; + // The frame size en/decoded by AV1 union { struct { int y_crop_width; @@ -139,7 +152,7 @@ // available return values. int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, - int byte_alignment, int num_pyramid_levels, + int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only); // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must @@ -149,15 +162,11 @@ // to decode the current frame. If cb is NULL, libaom will allocate memory // internally to decode the current frame. // -// If num_pyramid_levels > 0, then an image pyramid will be allocated with -// the specified number of levels. -// -// Any buffer which may become a source or ref frame buffer in the encoder -// must have num_pyramid_levels = cpi->image_pyramid_levels. This will cause -// an image pyramid to be allocated if one is needed. -// -// Any other buffers (in particular, any buffers inside the decoder) -// must have cpi->image_pyramid_levels = 0, as a pyramid is unneeded there. +// If alloc_pyramid is true, then an image pyramid will be allocated +// for use in global motion estimation. This is only needed if this frame +// buffer will be used to store a source frame or a reference frame in +// the encoder. 
Any other framebuffers (eg, intermediates for filtering, +// or any buffer in the decoder) can set alloc_pyramid = false. // // Returns 0 on success. Returns < 0 on failure. int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, @@ -165,7 +174,7 @@ int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, - int num_pyramid_levels, int alloc_y_plane_only); + bool alloc_pyramid, int alloc_y_plane_only); int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h new file mode 100644 index 0000000..425a6b0 --- /dev/null +++ b/aom_util/aom_pthread.h
@@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +// +// pthread.h wrapper + +#ifndef AOM_AOM_UTIL_AOM_PTHREAD_H_ +#define AOM_AOM_UTIL_AOM_PTHREAD_H_ + +#include "config/aom_config.h" + +#if CONFIG_MULTITHREAD + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include <errno.h> // NOLINT +#include <process.h> // NOLINT +#include <stddef.h> // NOLINT +#include <windows.h> // NOLINT +typedef HANDLE pthread_t; +typedef int pthread_attr_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. 
+#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_EXIT_SUCCESS 0 + +static INLINE int pthread_attr_init(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr, + size_t *stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + +static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr, + size_t stacksize) { + (void)attr; + (void)stacksize; + return EINVAL; +} + +static INLINE int pthread_create(pthread_t *const thread, + const pthread_attr_t *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwCreationFlags */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != + WAIT_OBJECT_0 || + CloseHandle(thread) == 0); 
+} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; + ok = SleepConditionVariableCS(condition, mutex, INFINITE); + return !ok; +} +#else // _WIN32 +#include <pthread.h> // NOLINT +#define THREADFN void * +#define THREAD_EXIT_SUCCESS NULL +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CONFIG_MULTITHREAD + +#endif // AOM_AOM_UTIL_AOM_PTHREAD_H_
diff --git a/aom_util/aom_thread.c b/aom_util/aom_thread.c index 14f19e5..783ffac 100644 --- a/aom_util/aom_thread.c +++ b/aom_util/aom_thread.c
@@ -23,8 +23,11 @@ #include <assert.h> #include <string.h> // for memset() +#include "config/aom_config.h" + #include "aom_mem/aom_mem.h" #include "aom_ports/sanitizer.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if CONFIG_MULTITHREAD @@ -65,29 +68,30 @@ #endif pthread_mutex_lock(&worker->impl_->mutex_); for (;;) { - while (worker->status_ == OK) { // wait in idling mode + while (worker->status_ == AVX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } - if (worker->status_ == WORK) { - // When worker->status_ is WORK, the main thread doesn't change - // worker->status_ and will wait until the worker changes worker->status_ - // to OK. See change_state(). So the worker can safely call execute() - // without holding worker->impl_->mutex_. When the worker reacquires - // worker->impl_->mutex_, worker->status_ must still be WORK. + if (worker->status_ == AVX_WORKER_STATUS_WORKING) { + // When worker->status_ is AVX_WORKER_STATUS_WORKING, the main thread + // doesn't change worker->status_ and will wait until the worker changes + // worker->status_ to AVX_WORKER_STATUS_OK. See change_state(). So the + // worker can safely call execute() without holding worker->impl_->mutex_. + // When the worker reacquires worker->impl_->mutex_, worker->status_ must + // still be AVX_WORKER_STATUS_WORKING. 
pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); pthread_mutex_lock(&worker->impl_->mutex_); - assert(worker->status_ == WORK); - worker->status_ = OK; + assert(worker->status_ == AVX_WORKER_STATUS_WORKING); + worker->status_ = AVX_WORKER_STATUS_OK; // signal to the main thread that we're done (for sync()) pthread_cond_signal(&worker->impl_->condition_); } else { - assert(worker->status_ == NOT_OK); // finish the worker + assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); // finish the worker break; } } pthread_mutex_unlock(&worker->impl_->mutex_); - return THREAD_RETURN(NULL); // Thread is finished + return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control @@ -98,13 +102,13 @@ if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); - if (worker->status_ >= OK) { + if (worker->status_ >= AVX_WORKER_STATUS_OK) { // wait for the worker to finish - while (worker->status_ != OK) { + while (worker->status_ != AVX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed - if (new_status != OK) { + if (new_status != AVX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } @@ -118,21 +122,21 @@ static void init(AVxWorker *const worker) { memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; + worker->status_ = AVX_WORKER_STATUS_NOT_OK; } static int sync(AVxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, OK); + change_state(worker, AVX_WORKER_STATUS_OK); #endif - assert(worker->status_ <= OK); + assert(worker->status_ <= AVX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(AVxWorker *const worker) { int ok = 1; worker->had_error = 0; - if (worker->status_ < OK) { + if (worker->status_ < AVX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ 
== NULL) { @@ -166,7 +170,7 @@ } pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); - if (ok) worker->status_ = OK; + if (ok) worker->status_ = AVX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); pthread_attr_destroy(&attr); if (!ok) { @@ -179,12 +183,12 @@ return 0; } #else - worker->status_ = OK; + worker->status_ = AVX_WORKER_STATUS_OK; #endif - } else if (worker->status_ > OK) { + } else if (worker->status_ > AVX_WORKER_STATUS_OK) { ok = sync(worker); } - assert(!ok || (worker->status_ == OK)); + assert(!ok || (worker->status_ == AVX_WORKER_STATUS_OK)); return ok; } @@ -196,7 +200,7 @@ static void launch(AVxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, WORK); + change_state(worker, AVX_WORKER_STATUS_WORKING); #else execute(worker); #endif @@ -205,7 +209,7 @@ static void end(AVxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { - change_state(worker, NOT_OK); + change_state(worker, AVX_WORKER_STATUS_NOT_OK); pthread_join(worker->impl_->thread_, NULL); pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); @@ -213,10 +217,10 @@ worker->impl_ = NULL; } #else - worker->status_ = NOT_OK; + worker->status_ = AVX_WORKER_STATUS_NOT_OK; assert(worker->impl_ == NULL); #endif - assert(worker->status_ == NOT_OK); + assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); } //------------------------------------------------------------------------------
diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h index 0e469c0..80ed314 100644 --- a/aom_util/aom_thread.h +++ b/aom_util/aom_thread.h
@@ -17,171 +17,15 @@ #ifndef AOM_AOM_UTIL_AOM_THREAD_H_ #define AOM_AOM_UTIL_AOM_THREAD_H_ -#include "config/aom_config.h" - #ifdef __cplusplus extern "C" { #endif -#define MAX_NUM_THREADS 64 - -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) && !HAVE_PTHREAD_H -// Prevent leaking max/min macros. -#undef NOMINMAX -#define NOMINMAX -#undef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#include <errno.h> // NOLINT -#include <process.h> // NOLINT -#include <windows.h> // NOLINT -typedef HANDLE pthread_t; -typedef int pthread_attr_t; -typedef CRITICAL_SECTION pthread_mutex_t; - -#if _WIN32_WINNT < 0x0600 -#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. -#endif -typedef CONDITION_VARIABLE pthread_cond_t; - -#ifndef WINAPI_FAMILY_PARTITION -#define WINAPI_PARTITION_DESKTOP 1 -#define WINAPI_FAMILY_PARTITION(x) x -#endif - -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define USE_CREATE_THREAD -#endif - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -// _beginthreadex requires __stdcall -#define THREADFN unsigned int __stdcall -#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) - -static INLINE int pthread_attr_init(pthread_attr_t *attr) { - (void)attr; - return 0; -} - -static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { - (void)attr; - return 0; -} - -static INLINE int pthread_attr_getstacksize(const pthread_attr_t *attr, - size_t *stacksize) { - (void)attr; - (void)stacksize; - return EINVAL; -} - -static INLINE int pthread_attr_setstacksize(pthread_attr_t *attr, - size_t stacksize) { - (void)attr; - (void)stacksize; - return EINVAL; -} - -static INLINE int pthread_create(pthread_t *const thread, - const pthread_attr_t *attr, - unsigned int(__stdcall *start)(void *), - void *arg) { - (void)attr; -#ifdef USE_CREATE_THREAD - *thread = CreateThread(NULL, /* lpThreadAttributes */ - 0, /* dwStackSize */ - start, arg, 0, /* dwStackSize */ - NULL); 
/* lpThreadId */ -#else - *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ - 0, /* unsigned stack_size */ - start, arg, 0, /* unsigned initflag */ - NULL); /* unsigned *thrdaddr */ -#endif - if (*thread == NULL) return 1; - SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); - return 0; -} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != - WAIT_OBJECT_0 || - CloseHandle(thread) == 0); -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; - InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); - return 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return TryEnterCriticalSection(mutex) ? 0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - EnterCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - LeaveCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - DeleteCriticalSection(mutex); - return 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - (void)condition; - return 0; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - (void)cond_attr; - InitializeConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - WakeConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { - WakeAllConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok; - ok = SleepConditionVariableCS(condition, mutex, INFINITE); - return !ok; -} -#else // _WIN32 -#include 
<pthread.h> // NOLINT -#define THREADFN void * -#define THREAD_RETURN(val) val -#endif - -#endif // CONFIG_MULTITHREAD - // State of the worker thread object typedef enum { - NOT_OK = 0, // object is unusable - OK, // ready to work - WORK // busy finishing the current task + AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable + AVX_WORKER_STATUS_OK, // ready to work + AVX_WORKER_STATUS_WORKING // busy finishing the current task } AVxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as
diff --git a/aom_util/aom_util.cmake b/aom_util/aom_util.cmake index 6bf4faf..d3da550 100644 --- a/aom_util/aom_util.cmake +++ b/aom_util/aom_util.cmake
@@ -13,7 +13,8 @@ endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_ set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1) -list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c" +list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_pthread.h" + "${AOM_ROOT}/aom_util/aom_thread.c" "${AOM_ROOT}/aom_util/aom_thread.h" "${AOM_ROOT}/aom_util/endian_inl.h")
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c index 7b24550..d0792e3 100644 --- a/aom_util/debug_util.c +++ b/aom_util/debug_util.c
@@ -108,7 +108,7 @@ static int frame_stride = MAX_FRAME_STRIDE; static int frame_height = MAX_FRAME_HEIGHT; static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; -void mismatch_move_frame_idx_w() { +void mismatch_move_frame_idx_w(void) { frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf overflow\n"); @@ -125,7 +125,7 @@ } } -void mismatch_move_frame_idx_r() { +void mismatch_move_frame_idx_r(void) { if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf underflow\n"); assert(0);
diff --git a/apps/aomdec.c b/apps/aomdec.c index 1efc091..15734cb 100644 --- a/apps/aomdec.c +++ b/apps/aomdec.c
@@ -834,6 +834,8 @@ dx_time += aom_usec_timer_elapsed(&timer); got_data = 0; + // TODO(aomedia:3519): Change the prototype of aom_codec_get_frame_fn_t to + // facilitate error handling. while ((img = aom_codec_get_frame(&decoder, &iter))) { ++frame_out; got_data = 1;
diff --git a/apps/aomenc.c b/apps/aomenc.c index 3e334cb..799fb3a 100644 --- a/apps/aomenc.c +++ b/apps/aomenc.c
@@ -1533,30 +1533,36 @@ if (stream->config.vmaf_model_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, stream->config.vmaf_model_path); + ctx_exit_on_error(&stream->encoder, "Failed to set vmaf model path"); } #endif if (stream->config.partition_info_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_PARTITION_INFO_PATH, stream->config.partition_info_path); + ctx_exit_on_error(&stream->encoder, "Failed to set partition info path"); } if (stream->config.enable_rate_guide_deltaq) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_ENABLE_RATE_GUIDE_DELTAQ, stream->config.enable_rate_guide_deltaq); + ctx_exit_on_error(&stream->encoder, "Failed to enable rate guide deltaq"); } if (stream->config.rate_distribution_info) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_RATE_DISTRIBUTION_INFO, stream->config.rate_distribution_info); + ctx_exit_on_error(&stream->encoder, "Failed to set rate distribution info"); } if (stream->config.film_grain_filename) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, stream->config.film_grain_filename); + ctx_exit_on_error(&stream->encoder, "Failed to set film grain table"); } AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE, stream->config.color_range); + ctx_exit_on_error(&stream->encoder, "Failed to set color range"); #if CONFIG_AV1_DECODER if (global->test_decode != TEST_DECODE_OFF) { @@ -2245,17 +2251,25 @@ AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, input.y4m.dst_c_dec_h >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, input.y4m.dst_c_dec_v >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); } else if (input.bit_depth == 12 && input.file_type == FILE_TYPE_RAW) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, 
stream->chroma_subsampling_x); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, stream->chroma_subsampling_y); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); } break; default: break;
diff --git a/av1/av1.cmake b/av1/av1.cmake index 1bb0539..99ce3fb 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -262,21 +262,24 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" - "${AOM_ROOT}/av1/common/x86/cdef_block_sse2.c" "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/resize_sse2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c") list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" - "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c" "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" "${AOM_ROOT}/av1/common/x86/resize_ssse3.c") +# Fallbacks to support Valgrind on 32-bit x86 +list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3_X86 + "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c") + list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" @@ -300,6 +303,7 @@ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/resize_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") @@ -351,28 +355,36 @@ "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c" - 
"${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c") + "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c" + "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" + "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/ml_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" + "${AOM_ROOT}/av1/encoder/arm/quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c" + "${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c" + "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c" + "${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD - "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c") + "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SVE + "${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c" + "${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 - "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c") + "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" @@ -392,17 +404,23 @@ "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c") list(APPEND 
AOM_AV1_COMMON_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM + "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE + "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") +list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 + "${AOM_ROOT}/av1/common/arm/convolve_sve2.c") + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") @@ -441,7 +459,7 @@ "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c") + "${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c") endif() if(CONFIG_AV1_HIGHBITDEPTH) @@ -471,6 +489,10 @@ "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c") + list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 + "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c" + "${AOM_ROOT}/av1/common/arm/highbd_convolve_sve2.c") + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c") @@ -484,9 +506,12 @@ "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c" - "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c") + "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c" + 
"${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_SVE + "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c") endif() if(CONFIG_ACCOUNTING) @@ -511,6 +536,9 @@ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c" "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c") + list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c") + list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/cnn.c" "${AOM_ROOT}/av1/encoder/cnn.h" @@ -596,6 +624,10 @@ require_compiler_flag_nomsvc("-mssse3" NO) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSSE3") + if(AOM_ARCH_X86) + add_intrinsics_object_library("-mssse3" "ssse3_x86" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SSSE3_X86") + endif() if(CONFIG_AV1_DECODER) if(AOM_AV1_DECODER_INTRIN_SSSE3) @@ -688,6 +720,15 @@ if(HAVE_SVE) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SVE") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SVE") + endif() + endif() + + if(HAVE_SVE2) + add_intrinsics_object_library("${AOM_SVE2_FLAG}" "sve2" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SVE2") endif() if(HAVE_VSX)
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c index 95f0ce8..690d959 100644 --- a/av1/av1_cx_iface.c +++ b/av1/av1_cx_iface.c
@@ -9,28 +9,37 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <limits.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> -#include "aom_mem/aom_mem.h" #include "config/aom_config.h" #include "config/aom_version.h" -#include "aom_ports/mem_ops.h" - +#include "aom/aomcx.h" #include "aom/aom_encoder.h" +#include "aom/aom_external_partition.h" +#include "aom/aom_image.h" #include "aom/internal/aom_codec_internal.h" - #include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_mem/aom_mem.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" +#include "av1/av1_cx_iface.h" #include "av1/av1_iface_common.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/common/scale.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/ethread.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" +#include "av1/encoder/lookahead.h" #include "av1/encoder/rc_utils.h" #include "av1/arg_defs.h" @@ -564,7 +573,10 @@ ratio->den /= denom; } -// Called by encoder_encode() only. Must not be called by encoder_init(). +// Called by encoder_encode() only. Must not be called by encoder_init() +// because the `error` parameter will be destroyed by aom_codec_enc_init_ver() +// after encoder_init() returns an error. See the "IMPORTANT" comment in +// aom_codec_enc_init_ver(). 
static aom_codec_err_t update_error_state( aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { const aom_codec_err_t res = error->error_code; @@ -662,6 +674,7 @@ RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); + RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); @@ -947,7 +960,7 @@ return AOM_CODEC_OK; } -int av1_get_image_bps(const aom_image_t *img) { +static int get_image_bps(const aom_image_t *img) { switch (img->fmt) { case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: @@ -1022,39 +1035,22 @@ } TuneCfg *const tune_cfg = &oxcf->tune_cfg; - FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; - TileConfig *const tile_cfg = &oxcf->tile_cfg; - ResizeCfg *const resize_cfg = &oxcf->resize_cfg; - GFConfig *const gf_cfg = &oxcf->gf_cfg; - PartitionCfg *const part_cfg = &oxcf->part_cfg; - IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg; - TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg; - CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg; - SuperResCfg *const superres_cfg = &oxcf->superres_cfg; - KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; - DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; - RateControlCfg *const rc_cfg = &oxcf->rc_cfg; - QuantizationCfg *const q_cfg = &oxcf->q_cfg; - ColorCfg *const color_cfg = &oxcf->color_cfg; - InputCfg *const input_cfg = &oxcf->input_cfg; - AlgoCfg *const algo_cfg = &oxcf->algo_cfg; - ToolCfg *const tool_cfg = &oxcf->tool_cfg; const int is_vbr = cfg->rc_end_usage == AOM_VBR; @@ -1604,11 +1600,26 @@ bool is_sb_size_changed = false; av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { - av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, - is_sb_size_changed); + AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i]; + 
struct aom_internal_error_info *const error = cpi->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; } if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); + AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; + struct aom_internal_error_info *const error = cpi_lap->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; } return AOM_CODEC_OK; } @@ -1822,6 +1833,11 @@ va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); +#if !CONFIG_QUANT_MATRIX + if (extra_cfg.enable_qm) { + ERROR("QM can't be enabled with CONFIG_QUANT_MATRIX=0."); + } +#endif return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { @@ -2608,10 +2624,22 @@ return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + cpi->svc.framedrop_mode = CAST(AV1E_SET_SVC_FRAME_DROP_MODE, args); + if (cpi->svc.framedrop_mode != AOM_LAYER_DROP && + cpi->svc.framedrop_mode != AOM_FULL_SUPERFRAME_DROP) + return AOM_CODEC_INVALID_PARAM; + else + return AOM_CODEC_OK; +} + #if !CONFIG_REALTIME_ONLY -aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, - STATS_BUFFER_CTX *stats_buf_context, - int num_lap_buffers) { +static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, + STATS_BUFFER_CTX *stats_buf_context, + int num_lap_buffers) { aom_codec_err_t res = AOM_CODEC_OK; int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); @@ -2768,8 +2796,8 @@ if (!priv->ppi) return 
AOM_CODEC_MEM_ERROR; #if !CONFIG_REALTIME_ONLY - res = av1_create_stats_buffer(&priv->frame_stats_buffer, - &priv->stats_buf_context, *num_lap_buffers); + res = create_stats_buffer(&priv->frame_stats_buffer, + &priv->stats_buf_context, *num_lap_buffers); if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR; assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); @@ -2818,8 +2846,8 @@ } } -void av1_destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, - FIRSTPASS_STATS *frame_stats_buffer) { +static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, + FIRSTPASS_STATS *frame_stats_buffer) { aom_free(stats_buf_context->total_left_stats); aom_free(stats_buf_context->total_stats); aom_free(frame_stats_buffer); @@ -2880,7 +2908,7 @@ } av1_remove_primary_compressor(ppi); } - av1_destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); + destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); aom_free(ctx); return AOM_CODEC_OK; } @@ -2948,8 +2976,7 @@ if (res == AOM_CODEC_OK) { const size_t uncompressed_frame_sz = ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * - ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * - av1_get_image_bps(img) / 8; + ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8; // Due to the presence of no-show frames, the ctx->cx_data buffer holds // compressed data corresponding to multiple frames. 
As no-show frames are @@ -3047,11 +3074,36 @@ ctx->pts_offset = ptsvol; ctx->pts_offset_initialized = 1; } + if (ptsvol < ctx->pts_offset) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } ptsvol -= ctx->pts_offset; + if (ptsvol > INT64_MAX / cpi_data.timestamp_ratio->num) { + aom_internal_error( + &ppi->error, AOM_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } int64_t src_time_stamp = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol); +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (ptsvol > INT64_MAX - (int64_t)duration) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + aom_codec_pts_t pts_end = ptsvol + (int64_t)duration; + if (pts_end > INT64_MAX / cpi_data.timestamp_ratio->num) { + aom_internal_error( + &ppi->error, AOM_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } int64_t src_end_time_stamp = - timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration); + timebase_units_to_ticks(cpi_data.timestamp_ratio, pts_end); YV12_BUFFER_CONFIG sd; res = image2yuvconfig(img, &sd); @@ -3085,18 +3137,27 @@ subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames, src_border_in_pixels, cpi->common.features.byte_alignment, ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0), - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } if (!ppi->lookahead) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); for (int i = 0; i < ppi->num_fp_contexts; i++) { - av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, - subsampling_x, subsampling_y); + aom_codec_err_t err = + av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, + subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + 
aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } } if (cpi_lap != NULL) { - av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x, - subsampling_y); + aom_codec_err_t err = av1_check_initial_width( + cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } } // Store the original flags in to the frame buffer. Will extract the @@ -3160,17 +3221,6 @@ av1_create_workers(ppi, num_workers); av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); } -#if CONFIG_MULTITHREAD - if (ppi->p_mt_info.num_workers > 1) { - for (int i = 0; i < ppi->num_fp_contexts; i++) { - av1_init_mt_sync(ppi->parallel_cpi[i], - ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); - } - if (cpi_lap != NULL) { - av1_init_mt_sync(cpi_lap, 1); - } - } -#endif // CONFIG_MULTITHREAD // Re-allocate thread data if workers for encoder multi-threading stage // exceeds prev_num_enc_workers. 
@@ -3192,6 +3242,17 @@ if (cpi_lap != NULL) { av1_init_frame_mt(ppi, cpi_lap); } +#if CONFIG_MULTITHREAD + if (ppi->p_mt_info.num_workers > 1) { + for (int i = 0; i < ppi->num_fp_contexts; i++) { + av1_init_mt_sync(ppi->parallel_cpi[i], + ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); + } + if (cpi_lap != NULL) { + av1_init_mt_sync(cpi_lap, 1); + } + } +#endif // CONFIG_MULTITHREAD // Call for LAP stage if (cpi_lap != NULL) { @@ -3199,11 +3260,8 @@ cpi_lap_data.flush = !img; cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio; const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data); - if (status != -1) { - if (status != AOM_CODEC_OK) { - aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s", - cpi->common.error->detail); - } + if (status > AOM_CODEC_OK) { + aom_internal_error_copy(&ppi->error, cpi_lap->common.error); } av1_post_encode_updates(cpi_lap, &cpi_lap_data); } @@ -3244,16 +3302,20 @@ status = av1_get_compressed_data(cpi, &cpi_data); } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1) { - status = av1_compress_parallel_frames(ppi, &cpi_data); + // In case of an error, longjmp() would be invoked and hence "status" + // is set to AOM_CODEC_OK here. + av1_compress_parallel_frames(ppi, &cpi_data); + status = AOM_CODEC_OK; } else { + // No possibility of failures from this function and hence "status" is + // set to AOM_CODEC_OK here. 
cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data); status = AOM_CODEC_OK; } } if (status == -1) break; if (status != AOM_CODEC_OK) { - aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s", - cpi->common.error->detail); + aom_internal_error_copy(&ppi->error, cpi->common.error); } if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) { av1_init_sc_decisions(ppi); @@ -3270,7 +3332,7 @@ if (ppi->cpi->oxcf.pass != 1) { ppi->total_time_compress_data += cpi->time_compress_data; ppi->total_recode_hits += cpi->frame_recode_hits; - ppi->total_bytes += cpi->bytes; + ppi->total_bytes += (uint64_t)cpi->bytes; for (int i = 0; i < MAX_MODES; i++) { ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; } @@ -3625,8 +3687,8 @@ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; - lc->scaling_factor_num = params->scaling_factor_num[sl]; - lc->scaling_factor_den = params->scaling_factor_den[sl]; + lc->scaling_factor_num = AOMMAX(1, params->scaling_factor_num[sl]); + lc->scaling_factor_den = AOMMAX(1, params->scaling_factor_den[sl]); const int layer_target_bitrate = params->layer_target_bitrate[layer]; if (layer_target_bitrate > INT_MAX / 1000) { lc->layer_target_bitrate = INT_MAX; @@ -4439,6 +4501,7 @@ { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr }, { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr }, + { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/av1/av1_cx_iface.h b/av1/av1_cx_iface.h index 05f4901..b2a7005 100644 --- a/av1/av1_cx_iface.h +++ b/av1/av1_cx_iface.h
@@ -20,13 +20,6 @@ AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg); -aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, - STATS_BUFFER_CTX *stats_buf_context, - int num_lap_buffers); - -void av1_destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, - FIRSTPASS_STATS *frame_stats_buffer); - aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, @@ -37,8 +30,6 @@ void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, BufferPool **p_buffer_pool); -int av1_get_image_bps(const aom_image_t *img); - #ifdef __cplusplus } // extern "C" #endif
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c index 29c63e2..18dc980 100644 --- a/av1/av1_dx_iface.c +++ b/av1/av1_dx_iface.c
@@ -19,18 +19,23 @@ #include "aom/internal/aom_image_internal.h" #include "aom/aomdx.h" #include "aom/aom_decoder.h" +#include "aom/aom_image.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" #include "av1/common/frame_buffers.h" #include "av1/common/enums.h" #include "av1/common/obu_util.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" +#include "av1/decoder/dthread.h" #include "av1/decoder/grain_synthesis.h" #include "av1/decoder/obu.h" @@ -305,10 +310,8 @@ return AOM_CODEC_UNSUP_BITSTREAM; } - if (parse_operating_points(&rb, reduced_still_picture_hdr, si) != - AOM_CODEC_OK) { - return AOM_CODEC_ERROR; - } + status = parse_operating_points(&rb, reduced_still_picture_hdr, si); + if (status != AOM_CODEC_OK) return status; int num_bits_width = aom_rb_read_literal(&rb, 4) + 1; int num_bits_height = aom_rb_read_literal(&rb, 4) + 1; @@ -814,102 +817,111 @@ // simply a pointer to an integer index uintptr_t *index = (uintptr_t *)iter; - if (ctx->frame_worker != NULL) { - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - AVxWorker *const worker = ctx->frame_worker; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - AV1Decoder *const pbi = frame_worker_data->pbi; - AV1_COMMON *const cm = &pbi->common; - CommonTileParams *const tiles = &cm->tiles; - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. 
- if (frame_worker_data->received_frame == 1) { - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - YV12_BUFFER_CONFIG *sd; - aom_film_grain_t *grain_params; - if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, - &grain_params) == 0) { - RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; - ctx->last_show_frame = output_frame_buf; - if (ctx->need_resync) return NULL; - aom_img_remove_metadata(&ctx->img); - yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); - move_decoder_metadata_to_img(pbi, &ctx->img); - - if (!pbi->ext_tile_debug && tiles->large_scale) { - *index += 1; // Advance the iterator to point to the next image - aom_img_remove_metadata(&ctx->img); - yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); - move_decoder_metadata_to_img(pbi, &ctx->img); - img = &ctx->img; - return img; - } - - const int num_planes = av1_num_planes(cm); - if (pbi->ext_tile_debug && tiles->single_tile_decoding && - pbi->dec_tile_row >= 0) { - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); - const int mi_row = tile_row * tile_height; - const int ssy = ctx->img.y_chroma_shift; - int plane; - ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += - mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; - } - } - ctx->img.d_h = - AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; - } - - if (pbi->ext_tile_debug && tiles->single_tile_decoding && - pbi->dec_tile_col >= 0) { - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); - const int mi_col = tile_col * tile_width; - const int ssx = ctx->img.x_chroma_shift; - const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
1 : 0; - int plane; - ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += - mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); - } - } - ctx->img.d_w = - AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; - } - - ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; - img = &ctx->img; - img->temporal_id = output_frame_buf->temporal_id; - img->spatial_id = output_frame_buf->spatial_id; - if (pbi->skip_film_grain) grain_params->apply_grain = 0; - aom_image_t *res = - add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); - if (!res) { - aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, - "Grain systhesis failed\n"); - } - *index += 1; // Advance the iterator to point to the next image - return res; - } - } else { - // Decoding failed. Release the worker thread. - frame_worker_data->received_frame = 0; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } + if (ctx->frame_worker == NULL) { + return NULL; } - return NULL; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + // Wait for the frame from worker thread. + if (!winterface->sync(worker)) { + // Decoding failed. Release the worker thread. + frame_worker_data->received_frame = 0; + ctx->need_resync = 1; + // TODO(aomedia:3519): Set an error code. Check if a different error code + // should be used if ctx->flushed != 1. + return NULL; + } + // Check if worker has received any frames. 
+ if (frame_worker_data->received_frame == 1) { + frame_worker_data->received_frame = 0; + check_resync(ctx, frame_worker_data->pbi); + } + YV12_BUFFER_CONFIG *sd; + aom_film_grain_t *grain_params; + if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) != + 0) { + return NULL; + } + RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; + ctx->last_show_frame = output_frame_buf; + if (ctx->need_resync) return NULL; + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); + move_decoder_metadata_to_img(pbi, &ctx->img); + + if (!pbi->ext_tile_debug && tiles->large_scale) { + *index += 1; // Advance the iterator to point to the next image + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); + move_decoder_metadata_to_img(pbi, &ctx->img); + img = &ctx->img; + return img; + } + + const int num_planes = av1_num_planes(cm); + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_row >= 0) { + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return NULL; + } + const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); + const int mi_row = tile_row * tile_height; + const int ssy = ctx->img.y_chroma_shift; + int plane; + ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += + mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; + } + } + ctx->img.d_h = + AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; + } + + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_col >= 0) { + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return NULL; + } + const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); + const int mi_col = tile_col * tile_width; + const int ssx = ctx->img.x_chroma_shift; 
+ const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + int plane; + ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); + } + } + ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; + } + + ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; + img = &ctx->img; + img->temporal_id = output_frame_buf->temporal_id; + img->spatial_id = output_frame_buf->spatial_id; + if (pbi->skip_film_grain) grain_params->apply_grain = 0; + aom_image_t *res = + add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); + if (!res) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.has_detail = 1; + snprintf(pbi->error.detail, sizeof(pbi->error.detail), + "Grain synthesis failed\n"); + return res; + } + *index += 1; // Advance the iterator to point to the next image + return res; } static aom_codec_err_t decoder_set_fb_fn( @@ -917,16 +929,17 @@ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return AOM_CODEC_INVALID_PARAM; - } else if (ctx->frame_worker == NULL) { + } + if (ctx->frame_worker != NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. 
- ctx->get_ext_fb_cb = cb_get; - ctx->release_ext_fb_cb = cb_release; - ctx->ext_priv = cb_priv; - return AOM_CODEC_OK; + return AOM_CODEC_ERROR; } - return AOM_CODEC_ERROR; + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, @@ -1422,7 +1435,9 @@ (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return AOM_CODEC_CORRUPT_FRAME; + } *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; return AOM_CODEC_OK; } else {
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c index 5e6ffc9..e9a38c4 100644 --- a/av1/common/alloccommon.c +++ b/av1/common/alloccommon.c
@@ -13,6 +13,8 @@ #include "config/aom_config.h" #include "aom_mem/aom_mem.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" @@ -20,6 +22,8 @@ #include "av1/common/cdef_block.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/restoration.h" #include "av1/common/thread_common.h" int av1_get_MBs(int width, int height) { @@ -99,10 +103,14 @@ if (*cdef_row_mt == NULL) return; #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { - pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); - pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); - aom_free((*cdef_row_mt)[row_idx].row_mutex_); - aom_free((*cdef_row_mt)[row_idx].row_cond_); + if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + } + if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) { + pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } } #else (void)num_mi_rows; @@ -167,7 +175,7 @@ if (*cdef_row_mt != NULL) return; CHECK_MEM_ERROR(cm, *cdef_row_mt, - aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows)); + aom_calloc(num_mi_rows, sizeof(**cdef_row_mt))); #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, @@ -177,8 +185,6 @@ CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); - - (*cdef_row_mt)[row_idx].is_row_done = 0; } #endif // CONFIG_MULTITHREAD } @@ -198,7 +204,7 @@ const int is_num_workers_changed = cdef_info->allocated_num_workers != num_workers; const int is_cdef_enabled = - cm->seq_params->enable_cdef && !cm->tiles.large_scale; + 
cm->seq_params->enable_cdef && !cm->tiles.single_tile_decoding; // num-bufs=3 represents ping-pong buffers for top linebuf, // followed by bottom linebuf. @@ -466,11 +472,11 @@ mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( mi_grid_size, sizeof(*mi_params->mi_grid_base)); if (!mi_params->mi_grid_base) return 1; - mi_params->mi_grid_size = mi_grid_size; mi_params->tx_type_map = aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); if (!mi_params->tx_type_map) return 1; + mi_params->mi_grid_size = mi_grid_size; } return 0;
diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c new file mode 100644 index 0000000..114232d --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon.c
@@ -0,0 +1,748 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/convolve_scale_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +/* 8-tap filter on four lanes: accumulates s0..s7 times filter lanes 0..7 onto horiz_const in 32 bits, then narrows with a non-rounding shift by ROUND0_BITS. */ +static INLINE int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, + const int32x4_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum = horiz_const; + sum = vmlal_lane_s16(sum, s0, filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, filter_hi, 3); + + return vshrn_n_s32(sum, ROUND0_BITS); +} +/* 8-tap filter on eight lanes kept in 16-bit accumulators; shifts by ROUND0_BITS - 1 because the callers in this file pass a filter already shifted right by one. */ +static INLINE int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + 
const int16x8_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} +/* Horizontal 8-tap pass for a general x_step_qn: each output column derives its source offset and filter from x_qn, results are staged in temp and stored transposed to dst. Rows are consumed 4 at a time when w == 4, otherwise in 8x8 tiles (the inner loop ends on width != 0, so w is expected to be a multiple of 8 there); a whole group of rows is written even when fewer rows of h remain. */ +static INLINE void convolve_horiz_scale_8tap_neon(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + + if (w == 4) { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. 
+ for (int r = 0; r < 4; ++r) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(x_filter + filter_offset); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. 
+ for (int r = 0; r < 8; ++r) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + int16x8_t filter = vld1q_s16(x_filter + filter_offset); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. + filter = vshrq_n_s16(filter, 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, + &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, + horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. 
+ int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} +/* 6-tap counterpart of the four-lane filter: multiplies s0..s5 by filter lanes 1..6 only and narrows with a non-rounding shift by ROUND0_BITS. */ +static INLINE int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, + const int32x4_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int32x4_t sum = horiz_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, filter_lo, 1); + sum = vmlal_lane_s16(sum, s1, filter_lo, 2); + sum = vmlal_lane_s16(sum, s2, filter_lo, 3); + sum = vmlal_lane_s16(sum, s3, filter_hi, 0); + sum = vmlal_lane_s16(sum, s4, filter_hi, 1); + sum = vmlal_lane_s16(sum, s5, filter_hi, 2); + + return vshrn_n_s32(sum, ROUND0_BITS); +} +/* 6-tap counterpart of the eight-lane 16-bit filter: multiplies s0..s5 by filter lanes 1..6 and shifts by ROUND0_BITS - 1. */ +static INLINE int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, + const int16x8_t horiz_const) { + int16x4_t filter_lo = vget_low_s16(filter); + int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlaq_lane_s16(sum, s0, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); + + // We halved the filter values so -1 from right shift. 
+ return vshrq_n_s16(sum, ROUND0_BITS - 1); +} +/* Same tiling as convolve_horiz_scale_8tap_neon for a general x_step_qn, but only six of the eight loaded sample vectors are passed to the 6-tap kernels. */ +static INLINE void convolve_horiz_scale_6tap_neon(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + + if (w == 4) { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(x_filter + filter_offset); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s3 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t d0 = + convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. 
+ // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + int16x8_t filter = vld1q_s16(x_filter + filter_offset); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. + filter = vshrq_n_s16(filter, 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, + &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t d0 = + convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. 
+ int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} +/* Horizontal 8-tap pass in which one x_filter serves every output column and successive outputs start two sample vectors apart (s0, s2, s4, s6); each iteration writes four output columns, for 4 rows when w == 4 and for 8 rows otherwise. */ +static INLINE void convolve_horiz_scale_2_8tap_neon( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + + if (w == 4) { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const int16x8_t filter = vld1q_s16(x_filter); + + do { + uint8x16_t t0, t1, t2, t3; + load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); + + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s1 = vget_low_s16(tt1); + int16x4_t s2 = vget_low_s16(tt2); + int16x4_t s3 = vget_low_s16(tt3); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s5 = vget_high_s16(tt1); + int16x4_t s6 = vget_high_s16(tt2); + int16x4_t s7 = vget_high_s16(tt3); + int16x4_t s8 = vget_low_s16(tt4); + int16x4_t s9 = vget_low_s16(tt5); + int16x4_t s10 = vget_low_s16(tt6); + 
int16x4_t s11 = vget_low_s16(tt7); + int16x4_t s12 = vget_high_s16(tt4); + int16x4_t s13 = vget_high_s16(tt5); + + int16x4_t d0 = + convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x4_t d1 = + convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x4_t d2 = + convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset); + int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, + horiz_offset); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. 
+ const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, + &t4, &t5, &t6, &t7); + + s += 8; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, + &t10, &t11, &t12, &t13, &t14, &t15); + + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + int16x8_t d0 = + convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x8_t d1 = + convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, + horiz_offset); + int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, + horiz_offset); + + transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); + + store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), + vget_low_s16(d2), 
vget_low_s16(d3), vget_high_s16(d0), + vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s7 = s15; + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} +/* 6-tap counterpart of convolve_horiz_scale_2_8tap_neon: one x_filter for every output column, successive outputs start two sample vectors apart, four output columns written per iteration. */ +static INLINE void convolve_horiz_scale_2_6tap_neon( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + + if (w == 4) { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const int16x8_t filter = vld1q_s16(x_filter); + + do { + uint8x16_t t0, t1, t2, t3; + load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); + + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s1 = vget_low_s16(tt1); + int16x4_t s2 = vget_low_s16(tt2); + int16x4_t s3 = vget_high_s16(tt3); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s5 = vget_high_s16(tt1); + int16x4_t s6 = vget_high_s16(tt2); + int16x4_t s7 = vget_low_s16(tt4); + int16x4_t s8 = vget_low_s16(tt5); + int16x4_t s9 = vget_low_s16(tt6); + int16x4_t s10 = vget_low_s16(tt7); + int16x4_t s11 = vget_high_s16(tt4); + + int16x4_t 
d0 = + convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + int16x4_t d1 = + convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x4_t d2 = + convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x4_t d3 = + convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The additional -1 is needed because we are halving the filter values. + const int16x8_t horiz_offset = + vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + // Filter values are all even so halve them to allow convolution + // kernel computations to stay in 16-bit element types. + const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, + &t4, &t5, &t6, &t7); + + s += 8; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, + &t10, &t11, &t12, &t13, &t14, &t15); + + int16x8_t s7 = 
vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + int16x8_t d0 = + convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); + int16x8_t d1 = + convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); + int16x8_t d2 = + convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); + int16x8_t d3 = + convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); + + transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); + + store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), + vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0), + vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} +/* Scaled 2D convolution entry point. Blocks with w < 4 or h < 4 are handed to av1_convolve_2d_scale_c. Otherwise a horizontal pass fills im_block (single-filter path when x_step_qn == 2 * (1 << SCALE_SUBPEL_BITS); 8-tap kernels when interp_filter is MULTITAP_SHARP, else 6-tap) and a vertical pass chosen by filter_params_y->interp_filter and the compound flags in conv_params produces the output. The 6-tap vertical helpers are given im_block advanced by one row. */ +void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. 
+ assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + + if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { + if (filter_params_x->interp_filter == MULTITAP_SHARP) { + convolve_horiz_scale_8tap_neon( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } else { + convolve_horiz_scale_6tap_neon( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } + } else { + assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); + // The filter index is calculated using the + // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS + // equation, where the values of x are from 0 to w. If x_step_qn is a + // multiple of (1 << SCALE_SUBPEL_BITS) we can leave it out of the equation. + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; + + // The source index is calculated using the (subpel_x_qn + x * x_step_qn) + // >> SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If + // subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << + // SCALE_SUBPEL_BITS) == 0, the source index can be determined using the + // value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)). 
+ if (filter_params_x->interp_filter == MULTITAP_SHARP) { + convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter); + } else { + convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter); + } + } + + // Vertical filter + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, 
dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } +}
diff --git a/av1/common/arm/av1_convolve_scale_neon_dotprod.c b/av1/common/arm/av1_convolve_scale_neon_dotprod.c new file mode 100644 index 0000000..70ae88c --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon_dotprod.c
@@ -0,0 +1,427 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> +#include <stddef.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_scale_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/enums.h" +#include "av1/common/filter.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +}; +// clang-format on + +static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + + int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); + int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); + + int32x4_t sum = vpaddq_s32(sum01, sum23); + + // We halved the filter values so -1 from right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); + int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); + int8x16_t s45_128 = vreinterpretq_s8_u8(vsubq_u8(s45, vdupq_n_u8(128))); + int8x16_t s67_128 = vreinterpretq_s8_u8(vsubq_u8(s67, vdupq_n_u8(128))); + + int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); + int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); + int32x4_t sum45 = vdotq_s32(horiz_const, s45_128, filters); + int32x4_t sum67 = vdotq_s32(horiz_const, s67_128, filters); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + + // We halved the filter values so -1 from right shift. 
+ return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter, const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_offset = + (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); + // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 + // from every source value. + const int32_t dotprod_offset = 128 << FILTER_BITS; + // Divide the total by 4: we halved the filter values and will use a pairwise + // add in the convolution kernel. + const int32x4_t horiz_offset_vec = + vdupq_n_s32((horiz_offset + dotprod_offset) >> 2); + + if (w == 4) { + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. + for (int r = 0; r < 4; r++) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset_vec); + + vst1_s16(&temp[r * 4], d0); + + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. 
+ int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; r++) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, + horiz_offset_vec); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. + int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t horiz_const, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + int32x4_t sum = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // We halved the filter values so -1 from right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], + const int8x8_t filters, + const int32x4_t horiz_const, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples0_128 = + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))); + int8x16_t samples1_128 = + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples0_128, permute_tbl.val[0]), + vqtbl1q_s8(samples0_128, permute_tbl.val[1]), + vqtbl1q_s8(samples1_128, permute_tbl.val[0]), + vqtbl1q_s8(samples1_128, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0123 = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum4567 = vdotq_lane_s32(horiz_const, perm_samples[2], filters, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filters, 1); + + // We halved the filter values so -1 from right shift. 
+ return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_2_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_offset = + (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); + // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 + // from every source value. + const int32_t dotprod_offset = 128 << FILTER_BITS; + // Divide the total by 2 because we halved the filter values. + const int32x4_t horiz_offset_vec = + vdupq_n_s32((horiz_offset + dotprod_offset) >> 1); + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1); + + if (w == 4) { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve8_4_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); + int16x4_t d1 = + convolve8_4_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); + int16x4_t d2 = + convolve8_4_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); + int16x4_t d3 = + convolve8_4_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); + + store_s16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 8, src_stride, &s0[1], 
&s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve8_8_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); + int16x8_t d1 = + convolve8_8_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); + int16x8_t d2 = + convolve8_8_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); + int16x8_t d3 = + convolve8_8_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 16; + d += 8; + width -= 8; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } +} + +void av1_convolve_2d_scale_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. + assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { + convolve_horiz_scale_neon_dotprod( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } else { + assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); + // The filter index is calculated using the + // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS + // equation, where the values of x are from 0 to w. If x_step_qn is a + // multiple of (1 << SCALE_SUBPEL_BITS) we can leave it out of the equation. + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; + + // The source index is calculated using the (subpel_x_qn + x * x_step_qn) >> + // SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If subpel_x_qn + // < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, + // the source index can be determined using the value x * (x_step_qn / + // (1 << SCALE_SUBPEL_BITS)). 
+ convolve_horiz_scale_2_neon_dotprod(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, + im_h, x_filter); + } + + // Vertical filter + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } +}
diff --git a/av1/common/arm/av1_convolve_scale_neon_i8mm.c b/av1/common/arm/av1_convolve_scale_neon_i8mm.c new file mode 100644 index 0000000..fe94c84 --- /dev/null +++ b/av1/common/arm/av1_convolve_scale_neon_i8mm.c
@@ -0,0 +1,403 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> +#include <stddef.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_scale_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/enums.h" +#include "av1/common/filter.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +}; +// clang-format on + +static INLINE int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + + int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); + int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); + + int32x4_t sum = vpaddq_s32(sum01, sum23); + + // We halved the filter values so -1 from right shift. 
+ return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7, + const int8x8_t filter, + const int32x4_t horiz_const) { + const int8x16_t filters = vcombine_s8(filter, filter); + + uint8x16_t s01 = vcombine_u8(s0, s1); + uint8x16_t s23 = vcombine_u8(s2, s3); + uint8x16_t s45 = vcombine_u8(s4, s5); + uint8x16_t s67 = vcombine_u8(s6, s7); + + int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); + int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); + int32x4_t sum45 = vusdotq_s32(horiz_const, s45, filters); + int32x4_t sum67 = vusdotq_s32(horiz_const, s67, filters); + + int32x4_t sum0123 = vpaddq_s32(sum01, sum23); + int32x4_t sum4567 = vpaddq_s32(sum45, sum67); + + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_neon_i8mm(const uint8_t *src, + int src_stride, int16_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter, + const int subpel_x_qn, + const int x_step_qn) { + DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Divide the total by 4: we halved the filter values and will use a pairwise + // add in the convolution kernel. + const int32x4_t horiz_offset = vdupq_n_s32( + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) >> 2); + + if (w == 4) { + do { + int x_qn = subpel_x_qn; + + // Process a 4x4 tile. 
+ for (int r = 0; r < 4; r++) { + const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + + int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset); + + vst1_s16(&temp[r * 4], d0); + x_qn += x_step_qn; + } + + // Transpose the 4x4 result tile and store. + int16x4_t d0, d1, d2, d3; + load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + int x_qn = subpel_x_qn; + int16_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; r++) { + const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = + vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, + horiz_offset); + + vst1q_s16(&temp[r * 8], d0); + + x_qn += x_step_qn; + } + + // Transpose the 8x8 result tile and store. 
+ int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + dst += 8 * dst_stride; + src += 8 * src_stride; + h -= 8; + } while (h > 0); + } +} + +static INLINE int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t horiz_const, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // We halved the filter values so -1 from right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], + const int8x8_t filters, + const int32x4_t horiz_const, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), + vqtbl1q_u8(samples[0], permute_tbl.val[1]), + vqtbl1q_u8(samples[1], permute_tbl.val[0]), + vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0123 = + vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + + // Second 4 output values. 
+ int32x4_t sum4567 = + vusdotq_lane_s32(horiz_const, perm_samples[2], filters, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[3], filters, 1); + + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_horiz_scale_2_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter) { + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The additional -1 is needed because we are halving the filter values. + const int32x4_t horiz_offset = + vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); + // Filter values are all even so halve them to fit in int8_t. + const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1); + + if (w == 4) { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve8_4_h_scale_2(s0, filter, horiz_offset, permute_tbl); + int16x4_t d1 = + convolve8_4_h_scale_2(s1, filter, horiz_offset, permute_tbl); + int16x4_t d2 = + convolve8_4_h_scale_2(s2, filter, horiz_offset, permute_tbl); + int16x4_t d3 = + convolve8_4_h_scale_2(s3, filter, horiz_offset, permute_tbl); + + store_s16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 4; + width -= 4; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + 
int16x8_t d0 = + convolve8_8_h_scale_2(s0, filter, horiz_offset, permute_tbl); + int16x8_t d1 = + convolve8_8_h_scale_2(s1, filter, horiz_offset, permute_tbl); + int16x8_t d2 = + convolve8_8_h_scale_2(s2, filter, horiz_offset, permute_tbl); + int16x8_t d3 = + convolve8_8_h_scale_2(s3, filter, horiz_offset, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 16; + d += 8; + width -= 8; + } while (width != 0); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } +} + +void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (w < 4 || h < 4) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_qn, subpel_y_qn, y_step_qn, conv_params); + return; + } + + // For the interpolation 8-tap filters are used. + assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); + + DECLARE_ALIGNED(32, int16_t, + im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; + const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; + + // Horizontal filter + if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { + convolve_horiz_scale_neon_i8mm( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); + } else { + assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); + // The filter index is calculated using the + // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS + // equation, where the values of x are from 0 to w. If x_step_qn is a + // multiple of (1 << SCALE_SUBPEL_BITS) we can leave it out of the equation. + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; + + // The source index is calculated using the (subpel_x_qn + x * x_step_qn) >> + // SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If subpel_x_qn + // < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, + // the source index can be determined using the value x * (x_step_qn / + // (1 << SCALE_SUBPEL_BITS)). 
+ convolve_horiz_scale_2_neon_i8mm(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter); + } + + // Vertical filter + if (filter_params_y->interp_filter == MULTITAP_SHARP) { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + compound_convolve_vert_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + if (UNLIKELY(conv_params->is_compound)) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + compound_dist_wtd_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, + subpel_y_qn, y_step_qn); + } else { + compound_avg_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, dst16, + dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, + y_step_qn); + } + } else { + compound_convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst16, dst16_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } else { + convolve_vert_scale_6tap_neon( + im_block + im_stride, im_stride, dst, dst_stride, w, h, + filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); + } + } +}
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c index 09e5166..f15d473 100644 --- a/av1/common/arm/av1_inv_txfm_neon.c +++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -447,7 +447,7 @@ out[7] = step1; } -void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { +static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { assert(!(size % 4)); if (!bit) return; const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); @@ -3661,7 +3661,7 @@ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); - av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); @@ -3736,8 +3736,7 @@ } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -3814,8 +3813,9 @@ } } -void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + int eob) { (void)eob; TX_SIZE tx_size = TX_4X8; DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); @@ -3879,8 +3879,9 @@ } } -void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + int eob) { (void)eob; TX_SIZE tx_size = TX_8X4; DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); @@ -3944,8 +3945,9 @@ } } -void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { 
(void)eob; TX_SIZE tx_size = TX_4X16; DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); @@ -4008,8 +4010,9 @@ } } -void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, int eob) { +static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_16X4; DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); @@ -4112,7 +4115,7 @@ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); - av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); @@ -4130,8 +4133,7 @@ } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) {
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c index 22d2977..7afb1a9 100644 --- a/av1/common/arm/blend_a64_hmask_neon.c +++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -73,7 +73,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -88,7 +88,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_2x2(dst, dst_stride, blend); + store_u8x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride;
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c index d53d363..9aea299 100644 --- a/av1/common/arm/blend_a64_vmask_neon.c +++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -78,7 +78,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; @@ -97,7 +97,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); - store_unaligned_u8_2x2(dst, dst_stride, blend); + store_u8x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride;
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c index 4465e0b..53d3a9f 100644 --- a/av1/common/arm/cdef_block_neon.c +++ b/av1/common/arm/cdef_block_neon.c
@@ -9,124 +9,69 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "aom_dsp/aom_simd.h" -#include "aom_dsp/arm/mem_neon.h" +#include <arm_neon.h> +#include <assert.h> -#define SIMD_FUNC(name) name##_neon -#include "av1/common/cdef_block_simd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/cdef_block.h" void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { - int j; - for (int i = 0; i < height; i++) { - for (j = 0; j < (width & ~0x7); j += 8) { - v64 row = v64_load_unaligned(&src[i * sstride + j]); - v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + do { + const uint8_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 16) { + uint8x16_t row = vld1q_u8(src_ptr + w); + uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } }; + vst2q_u8((uint8_t *)(dst_ptr + w), row_u16); + + w += 16; } - for (; j < width; j++) { - dst[i * dstride + j] = src[i * sstride + j]; + if (width - w >= 8) { + uint8x8_t row = vld1_u8(src_ptr + w); + vst1q_u16(dst_ptr + w, vmovl_u8(row)); + w += 8; } - } + if (width - w == 4) { + for (int i = w; i < w + 4; i++) { + dst_ptr[i] = src_ptr[i]; + } + } + + src += sstride; + dst += dstride; + } while (--height != 0); } -static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) { - return vreinterpretq_s16_s64(vcombine_s64(vcreate_s64(a), vcreate_s64(b))); -} +void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; -#define SHL_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, 
vget_lane_u64(vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (n - 8) * 8), \ - 0)); \ - } + int w = 0; + while (width - w >= 8) { + uint16x8_t row = vld1q_u16(src_ptr + w); + vst1q_u16(dst_ptr + w, row); -#define SHL_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, vget_lane_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), 0)); \ - } + w += 8; + } + if (width - w == 4) { + uint16x4_t row = vld1_u16(src_ptr + w); + vst1_u16(dst_ptr + w, row); + } -#define SHL_LOW_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), 0), \ - vget_lane_u64( \ - vorr_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (8 - n) * 8)), \ - 0)); \ - } - -SHL_HIGH_NEON(14) -SHL_HIGH_NEON(12) -SHL_HIGH_NEON(10) -SHL_NEON(8) -SHL_LOW_NEON(6) -SHL_LOW_NEON(4) -SHL_LOW_NEON(2) - -#define v128_shl_n_byte_neon(a, n) v128_shl_##n##_byte_neon(a) - -#define SHR_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (n - 8) * 8), \ - 0), \ - 0); \ - } - -#define SHR_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), 0), 0); \ - } - -#define SHR_LOW_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vorr_u64( \ - 
vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (8 - n) * 8)), \ - 0), \ - vget_lane_u64( \ - vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - 0)); \ - } - -SHR_HIGH_NEON(14) -SHR_HIGH_NEON(12) -SHR_HIGH_NEON(10) -SHR_NEON(8) -SHR_LOW_NEON(6) -SHR_LOW_NEON(4) -SHR_LOW_NEON(2) - -#define v128_shr_n_byte_neon(a, n) v128_shr_##n##_byte_neon(a) - -static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) { - uint32x4_t t1 = - vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b))); - uint32x4_t t2 = - vreinterpretq_u32_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b))); -#if AOM_ARCH_AARCH64 - return vpaddq_u32(t1, t2); -#else - return vcombine_u32(vpadd_u32(vget_low_u32(t1), vget_high_u32(t1)), - vpadd_u32(vget_low_u32(t2), vget_high_u32(t2))); -#endif + src += sstride; + dst += dstride; + } while (--height != 0); } // partial A is a 16-bit vector of the form: @@ -139,8 +84,8 @@ int16x8_t partialb, uint32x4_t const1, uint32x4_t const2) { - int16x8_t tmp; // Reverse partial B. + // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. uint8x16_t pattern = vreinterpretq_u8_u64( vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); @@ -156,98 +101,100 @@ partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif - // Interleave the x and y values of identical indices and pair x8 with 0. - tmp = partiala; - partiala = vzipq_s16(partiala, partialb).val[0]; - partialb = vzipq_s16(tmp, partialb).val[1]; // Square and add the corresponding x and y values. 
- uint32x4_t partiala_u32 = v128_madd_s16_neon(partiala, partiala); - uint32x4_t partialb_u32 = v128_madd_s16_neon(partialb, partialb); + int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); + cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); + int32x4_t cost_hi = + vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); + cost_hi = + vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); // Multiply by constant. - partiala_u32 = vmulq_u32(partiala_u32, const1); - partialb_u32 = vmulq_u32(partialb_u32, const2); - - // Sum all results. - partiala_u32 = vaddq_u32(partiala_u32, partialb_u32); - return partiala_u32; + uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); + return cost; } -static INLINE uint64x2_t ziplo_u64(uint32x4_t a, uint32x4_t b) { - return vcombine_u64(vget_low_u64(vreinterpretq_u64_u32(a)), - vget_low_u64(vreinterpretq_u64_u32(b))); -} - -static INLINE uint64x2_t ziphi_u64(uint32x4_t a, uint32x4_t b) { - return vcombine_u64(vget_high_u64(vreinterpretq_u64_u32(a)), - vget_high_u64(vreinterpretq_u64_u32(b))); -} - -static INLINE uint32x4_t hsum4_neon(uint32x4_t x0, uint32x4_t x1, uint32x4_t x2, - uint32x4_t x3) { - uint32x4_t t0, t1, t2, t3; - t0 = vzipq_u32(x0, x1).val[0]; - t1 = vzipq_u32(x2, x3).val[0]; - t2 = vzipq_u32(x0, x1).val[1]; - t3 = vzipq_u32(x2, x3).val[1]; - x0 = vreinterpretq_u32_u64(ziplo_u64(t0, t1)); - x1 = vreinterpretq_u32_u64(ziphi_u64(t0, t1)); - x2 = vreinterpretq_u32_u64(ziplo_u64(t2, t3)); - x3 = vreinterpretq_u32_u64(ziphi_u64(t2, t3)); - return vaddq_u32(vaddq_u32(x0, x1), vaddq_u32(x2, x3)); -} - -static INLINE uint32x4_t compute_directions_neon(int16x8_t lines[8], - uint32_t cost[4]) { - int16x8_t partial4a, partial4b, partial5a, partial5b, partial6, partial7a, - partial7b; - int16x8_t tmp; +// This function computes the cost along directions 4, 5, 6, 7. 
(4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); // Partial sums for lines 0 and 1. 
- partial4a = v128_shl_n_byte_neon(lines[0], 14); - partial4b = v128_shr_n_byte_neon(lines[0], 2); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[1], 12)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[1], 4)); - tmp = vaddq_s16(lines[0], lines[1]); - partial5a = v128_shl_n_byte_neon(tmp, 10); - partial5b = v128_shr_n_byte_neon(tmp, 6); - partial7a = v128_shl_n_byte_neon(tmp, 4); - partial7b = v128_shr_n_byte_neon(tmp, 12); - partial6 = tmp; + int16x8_t partial4a = vextq_s16(zero, lines[0], 1); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); + int16x8_t partial4b = vextq_s16(lines[0], zero, 1); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); + int16x8_t tmp = vaddq_s16(lines[0], lines[1]); + int16x8_t partial5a = vextq_s16(zero, tmp, 3); + int16x8_t partial5b = vextq_s16(tmp, zero, 3); + int16x8_t partial7a = vextq_s16(zero, tmp, 6); + int16x8_t partial7b = vextq_s16(tmp, zero, 6); + int16x8_t partial6 = tmp; // Partial sums for lines 2 and 3. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[2], 10)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[2], 6)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[3], 8)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[3], 8)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); tmp = vaddq_s16(lines[2], lines[3]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 8)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 8)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 6)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 10)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 4 and 5. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[4], 6)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[4], 10)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[5], 4)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[5], 12)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); tmp = vaddq_s16(lines[4], lines[5]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 6)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 10)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 8)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 8)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 6 and 7. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[6], 2)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[6], 14)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); partial4a = vaddq_s16(partial4a, lines[7]); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); tmp = vaddq_s16(lines[6], lines[7]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 4)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 12)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 10)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 6)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); partial6 = vaddq_s16(partial6, tmp); uint32x4_t const0 = vreinterpretq_u32_u64( @@ -263,74 +210,173 @@ vcreate_u64((uint64_t)105 << 32 | 105))); // Compute costs in terms of partial sums. 
- uint32x4_t partial4a_u32 = - fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); - uint32x4_t partial7a_u32 = - fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); - uint32x4_t partial5a_u32 = - fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); - uint32x4_t partial6_u32 = v128_madd_s16_neon(partial6, partial6); - partial6_u32 = vmulq_u32(partial6_u32, vdupq_n_u32(105)); + int32x4_t partial6_s32 = + vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); + partial6_s32 = + vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); - partial4a_u32 = - hsum4_neon(partial4a_u32, partial5a_u32, partial6_u32, partial7a_u32); - vst1q_u32(cost, partial4a_u32); - return partial4a_u32; + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); + costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); + costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); + costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; } -static INLINE int64x2_t ziplo_s64(int32x4_t a, int32x4_t b) { - return vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(a)), - vget_low_s64(vreinterpretq_s64_s32(b))); -} +static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala, + int16x8_t partialb, + int16x8_t partialc, + uint32x4_t const0) { + // Reverse partial c. + // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }. 
+ uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a), + vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302))); -static INLINE int64x2_t ziphi_s64(int32x4_t a, int32x4_t b) { - return vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(a)), - vget_high_s64(vreinterpretq_s64_s32(b))); -} - -// Transpose and reverse the order of the lines -- equivalent to a 90-degree -// counter-clockwise rotation of the pixels. -static INLINE void array_reverse_transpose_8x8_neon(int16x8_t *in, - int16x8_t *res) { - const int32x4_t tr0_0 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[0]); - const int32x4_t tr0_1 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[0]); - const int32x4_t tr0_2 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[1]); - const int32x4_t tr0_3 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[1]); - const int32x4_t tr0_4 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[0]); - const int32x4_t tr0_5 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[0]); - const int32x4_t tr0_6 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[1]); - const int32x4_t tr0_7 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[1]); - - const int32x4_t tr1_0 = vzipq_s32(tr0_0, tr0_1).val[0]; - const int32x4_t tr1_1 = vzipq_s32(tr0_4, tr0_5).val[0]; - const int32x4_t tr1_2 = vzipq_s32(tr0_0, tr0_1).val[1]; - const int32x4_t tr1_3 = vzipq_s32(tr0_4, tr0_5).val[1]; - const int32x4_t tr1_4 = vzipq_s32(tr0_2, tr0_3).val[0]; - const int32x4_t tr1_5 = vzipq_s32(tr0_6, tr0_7).val[0]; - const int32x4_t tr1_6 = vzipq_s32(tr0_2, tr0_3).val[1]; - const int32x4_t tr1_7 = vzipq_s32(tr0_6, tr0_7).val[1]; - - res[7] = vreinterpretq_s16_s64(ziplo_s64(tr1_0, tr1_1)); - res[6] = vreinterpretq_s16_s64(ziphi_s64(tr1_0, tr1_1)); - res[5] = vreinterpretq_s16_s64(ziplo_s64(tr1_2, tr1_3)); - res[4] = vreinterpretq_s16_s64(ziphi_s64(tr1_2, tr1_3)); - res[3] = vreinterpretq_s16_s64(ziplo_s64(tr1_4, tr1_5)); - res[2] = 
vreinterpretq_s16_s64(ziphi_s64(tr1_4, tr1_5)); - res[1] = vreinterpretq_s16_s64(ziplo_s64(tr1_6, tr1_7)); - res[0] = vreinterpretq_s16_s64(ziphi_s64(tr1_6, tr1_7)); -} - -static INLINE uint32_t compute_best_dir(uint8x16_t a) { - uint8x16_t idx = - vandq_u8(a, vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); #if AOM_ARCH_AARCH64 - return vaddv_u8(vget_low_u8(idx)) + (vaddv_u8(vget_high_u8(idx)) << 8); + partialc = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern)); #else - uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(idx))); - uint8x16_t s = vreinterpretq_u8_u64(m); - return vget_lane_u32( - vreinterpret_u32_u8(vzip_u8(vget_low_u8(s), vget_high_u8(s)).val[0]), 0); + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)), + vget_high_s8(vreinterpretq_s8_s16(partialc)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif + + int32x4_t partiala_s32 = vpaddlq_s16(partiala); + int32x4_t partialb_s32 = vpaddlq_s16(partialb); + int32x4_t partialc_s32 = vpaddlq_s16(partialc); + + partiala_s32 = vmulq_s32(partiala_s32, partiala_s32); + partialb_s32 = vmulq_s32(partialb_s32, partialb_s32); + partialc_s32 = vmulq_s32(partialc_s32, partialc_s32); + + partiala_s32 = vaddq_s32(partiala_s32, partialc_s32); + + uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0); + return cost; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. 
For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Compute diagonal directions (1, 2, 3). + // Partial sums for lines 0 and 1. + int16x8_t partial0a = lines[0]; + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7)); + int16x8_t partial0b = vextq_s16(lines[1], zero, 7); + int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6)); + int16x8_t partial1b = vextq_s16(lines[1], zero, 6); + int16x8_t partial3a = vextq_s16(lines[0], zero, 2); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4)); + int16x8_t partial3b = vextq_s16(zero, lines[0], 2); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4)); + + // Partial sums for lines 2 and 3. 
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2)); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6)); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6)); + partial3b = vaddq_s16(partial3b, lines[3]); + + // Partial sums for lines 4 and 5. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3)); + partial1b = vaddq_s16(partial1b, lines[4]); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6)); + int16x8_t partial1c = vextq_s16(lines[5], zero, 6); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4)); + int16x8_t partial3c = vextq_s16(zero, lines[4], 2); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4)); + + // Partial sums for lines 6 and 7. 
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6)); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6)); + partial3c = vaddq_s16(partial3c, lines[7]); + + // Special case for direction 2 as it's just a sum along each line. + int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] }; + int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] }; + int32x4_t partial2a = horizontal_add_4d_s16x8(lines03); + int32x4_t partial2b = horizontal_add_4d_s16x8(lines47); + + uint32x4_t partial2a_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a)); + uint32x4_t partial2b_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b)); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420), + vcreate_u64((uint64_t)105 << 32 | 140))); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1); + costs[1] = + fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2); + costs[2] = vaddq_u32(partial2a_u32, partial2b_u32); + costs[2] = vmulq_n_u32(costs[2], 105); + costs[3] = + fold_mul_and_sum_pairwise_neon(partial3c, partial3b, 
partial3a, const2); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; } int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, @@ -346,22 +392,40 @@ } // Compute "mostly vertical" directions. - uint32x4_t cost47 = compute_directions_neon(lines, cost + 4); - - array_reverse_transpose_8x8_neon(lines, lines); + uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4); // Compute "mostly horizontal" directions. - uint32x4_t cost03 = compute_directions_neon(lines, cost); + uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost); - uint32x4_t max_cost = vmaxq_u32(cost03, cost47); - max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 2)); - max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 1)); - best_cost = vgetq_lane_u32(max_cost, 0); - uint16x8_t idx = vcombine_u16(vqmovn_u32(vceqq_u32(max_cost, cost03)), - vqmovn_u32(vceqq_u32(max_cost, cost47))); - uint8x16_t idx_u8 = vcombine_u8(vqmovn_u16(idx), vqmovn_u16(idx)); - best_dir = compute_best_dir(idx_u8); - best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + uint32x4_t cost07 = vmaxq_u32(cost03, cost47); +#if AOM_ARCH_AARCH64 + best_cost = vmaxvq_u32(cost07); + uint32x4_t max_cost = vdupq_n_u32(best_cost); + uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)), + vreinterpretq_u8_u32( + vceqq_u32(max_cost, cost47)) } }; + // idx = { 28, 24, 20, 16, 12, 8, 4, 0 }; + uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); + // Get the lowest 8 bit of each 32-bit elements and reverse them. 
+ uint8x8_t tbl = vqtbl2_u8(costs, idx); + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0); + best_dir = aom_clzll(a) >> 3; +#else + uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07)); + cost64 = vpmax_u32(cost64, cost64); + uint32x4_t max_cost = vcombine_u32(cost64, cost64); + best_cost = vget_lane_u32(cost64, 0); + uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)), + vmovn_u32(vceqq_u32(max_cost, cost47))); + uint8x8_t idx = + vand_u8(vmovn_u16(costs), + vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); + int sum = horizontal_add_u8x8(idx); + best_dir = get_msb(sum ^ (sum - 1)); +#endif // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. @@ -386,408 +450,274 @@ // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b, unsigned int threshold, int adjdamp) { - int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, b)); - const int16x8_t sign = vshrq_n_s16(diff, 15); - diff = vabsq_s16(diff); - const uint16x8_t s = - vqsubq_u16(vdupq_n_u16(threshold), - vreinterpretq_u16_s16(vshlq_s16(diff, vdupq_n_s16(-adjdamp)))); - return veorq_s16(vaddq_s16(sign, vminq_s16(diff, vreinterpretq_s16_u16(s))), - sign); + uint16x8_t diff = vabdq_u16(a, b); + const uint16x8_t a_gt_b = vcgtq_u16(a, b); + const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold), + vshlq_u16(diff, vdupq_n_s16(-adjdamp))); + const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s)); + return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); } -static INLINE uint16x8_t get_max_primary(const int is_lowbd, uint16x8_t *tap, - uint16x8_t max, - uint16x8_t cdef_large_value_mask) { - if (is_lowbd) { - uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2])); - max_u8 = vmaxq_u8(max_u8, 
vreinterpretq_u8_u16(tap[3])); - /* The source is 16 bits, however, we only really care about the lower - 8 bits. The upper 8 bits contain the "large" flag. After the final - primary max has been calculated, zero out the upper 8 bits. Use this - to find the "16 bit" max. */ - max = vmaxq_u16( - max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask)); - } else { - /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ - max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask)); - } - return max; +static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4], + const int *pri_taps, int pri_strength, + int pri_damping, int16x8_t *sum) { + // Near taps + int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping); + int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping); + // sum += pri_taps[0] * (n0 + n1) + n0 = vaddq_s16(n0, n1); + *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]); + + // Far taps + int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping); + int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping); + // sum += pri_taps[1] * (f0 + f1) + f0 = vaddq_s16(f0, f1); + *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); } -static INLINE uint16x8_t get_max_secondary(const int is_lowbd, uint16x8_t *tap, - uint16x8_t max, - uint16x8_t cdef_large_value_mask) { - if (is_lowbd) { - uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[4])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[5])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[6])); - max_u8 = vmaxq_u8(max_u8, 
vreinterpretq_u8_u16(tap[7])); - /* The source is 16 bits, however, we only really care about the lower - 8 bits. The upper 8 bits contain the "large" flag. After the final - primary max has been calculated, zero out the upper 8 bits. Use this - to find the "16 bit" max. */ - max = vmaxq_u16( - max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask)); - } else { - /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ - max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[4], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[5], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[6], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[7], cdef_large_value_mask)); - } - return max; -} +static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8], + const int *sec_taps, int sec_strength, + int sec_damping, int16x8_t *sum) { + // Near taps + int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping); + int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping); + int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping); + int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping); -static INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, int pri_damping, - int sec_damping, int coeff_shift, - int height, int enable_primary, - int enable_secondary) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - const int clipping_required = enable_primary && enable_secondary; - uint16x8_t max, min; - const uint16x8_t cdef_large_value_mask = - vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); - const int po1 = cdef_directions[dir][0]; - const int po2 = 
cdef_directions[dir][1]; - const int s1o1 = cdef_directions[dir + 2][0]; - const int s1o2 = cdef_directions[dir + 2][1]; - const int s2o1 = cdef_directions[dir - 2][0]; - const int s2o2 = cdef_directions[dir - 2][1]; - const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps; + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]); - if (enable_primary && pri_strength) { - pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); - } - if (enable_secondary && sec_strength) { - sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); - } + // Far taps + s0 = constrain16(tap[4], s, sec_strength, sec_damping); + s1 = constrain16(tap[5], s, sec_strength, sec_damping); + s2 = constrain16(tap[6], s, sec_strength, sec_damping); + s3 = constrain16(tap[7], s, sec_strength, sec_damping); - int h = height; - do { - int16x8_t sum = vdupq_n_s16(0); - uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); - max = min = s; - - if (enable_primary) { - uint16x8_t tap[4]; - - // Primary near taps - tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); - tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); - int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping); - int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping); - - // sum += pri_taps[0] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0])); - - // Primary far taps - tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); - tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); - p0 = constrain16(tap[2], s, pri_strength, pri_damping); - p1 = constrain16(tap[3], s, pri_strength, pri_damping); - - // sum += pri_taps[1] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1])); - - if (clipping_required) { - max = get_max_primary(is_lowbd, tap, max, 
cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - } - } - - if (enable_secondary) { - uint16x8_t tap[8]; - - // Secondary near taps - tap[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); - tap[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); - tap[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); - tap[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); - int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping); - int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping); - int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping); - int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping); - - // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0])); - - // Secondary far taps - tap[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); - tap[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); - tap[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); - tap[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); - p0 = constrain16(tap[4], s, sec_strength, sec_damping); - p1 = constrain16(tap[5], s, sec_strength, sec_damping); - p2 = constrain16(tap[6], s, sec_strength, sec_damping); - p3 = constrain16(tap[7], s, sec_strength, sec_damping); - - // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1])); - - if (clipping_required) { - max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - min = vminq_u16(min, tap[4]); - min = vminq_u16(min, tap[5]); - min = vminq_u16(min, tap[6]); - min = vminq_u16(min, tap[7]); - } - } - - // res = row + ((sum - 
(sum < 0) + 8) >> 4) - sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); - int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8)); - res = vshrq_n_s16(res, 4); - res = vaddq_s16(vreinterpretq_s16_u16(s), res); - - if (clipping_required) { - res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), - vreinterpretq_s16_u16(max)); - } - - if (is_lowbd) { - const uint8x8_t res_128 = vqmovun_s16(res); - store_unaligned_u8_4x2(dst8, dstride, res_128); - } else { - store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res)); - } - - in += 2 * CDEF_BSTRIDE; - dst8 += 2 * dstride; - dst16 += 2 * dstride; - h -= 2; - } while (h != 0); -} - -static INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, int pri_damping, - int sec_damping, int coeff_shift, - int height, int enable_primary, - int enable_secondary) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - const int clipping_required = enable_primary && enable_secondary; - uint16x8_t max, min; - const uint16x8_t cdef_large_value_mask = - vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); - const int po1 = cdef_directions[dir][0]; - const int po2 = cdef_directions[dir][1]; - const int s1o1 = cdef_directions[dir + 2][0]; - const int s1o2 = cdef_directions[dir + 2][1]; - const int s2o1 = cdef_directions[dir - 2][0]; - const int s2o2 = cdef_directions[dir - 2][1]; - const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps; - - if (enable_primary && pri_strength) { - pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); - } - if (enable_secondary && sec_strength) { - sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); - } - - int h = height; - do { - int16x8_t sum = vdupq_n_s16(0); - uint16x8_t s = vld1q_u16(in); - max = min = s; - - if (enable_primary) { - uint16x8_t tap[4]; - - // Primary near taps - tap[0] = vld1q_u16(in 
+ po1); - tap[1] = vld1q_u16(in - po1); - int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping); - int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping); - - // sum += pri_taps[0] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0])); - - // Primary far taps - tap[2] = vld1q_u16(in + po2); - p0 = constrain16(tap[2], s, pri_strength, pri_damping); - tap[3] = vld1q_u16(in - po2); - p1 = constrain16(tap[3], s, pri_strength, pri_damping); - - // sum += pri_taps[1] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1])); - if (clipping_required) { - max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - } - } - - if (enable_secondary) { - uint16x8_t tap[8]; - - // Secondary near taps - tap[0] = vld1q_u16(in + s1o1); - tap[1] = vld1q_u16(in - s1o1); - tap[2] = vld1q_u16(in + s2o1); - tap[3] = vld1q_u16(in - s2o1); - int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping); - int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping); - int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping); - int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping); - - // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0])); - - // Secondary far taps - tap[4] = vld1q_u16(in + s1o2); - tap[5] = vld1q_u16(in - s1o2); - tap[6] = vld1q_u16(in + s2o2); - tap[7] = vld1q_u16(in - s2o2); - p0 = constrain16(tap[4], s, sec_strength, sec_damping); - p1 = constrain16(tap[5], s, sec_strength, sec_damping); - p2 = constrain16(tap[6], s, sec_strength, sec_damping); - p3 = constrain16(tap[7], s, sec_strength, sec_damping); - - // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); 
- p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1])); - - if (clipping_required) { - max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - min = vminq_u16(min, tap[4]); - min = vminq_u16(min, tap[5]); - min = vminq_u16(min, tap[6]); - min = vminq_u16(min, tap[7]); - } - } - - // res = row + ((sum - (sum < 0) + 8) >> 4) - sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); - int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8)); - res = vshrq_n_s16(res, 4); - res = vaddq_s16(vreinterpretq_s16_u16(s), res); - if (clipping_required) { - res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), - vreinterpretq_s16_u16(max)); - } - - if (is_lowbd) { - const uint8x8_t res_128 = vqmovun_s16(res); - vst1_u8(dst8, res_128); - } else { - vst1q_u16(dst16, vreinterpretq_u16_s16(res)); - } - - in += CDEF_BSTRIDE; - dst8 += dstride; - dst16 += dstride; - } while (--h != 0); -} - -static INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int height) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - - int h = height; - do { - const uint16x8_t row = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); - if (is_lowbd) { - const uint8x8_t res_128 = vqmovn_u16(row); - store_unaligned_u8_4x2(dst8, dstride, res_128); - } else { - store_unaligned_u16_4x2(dst16, dstride, row); - } - - in += 2 * CDEF_BSTRIDE; - dst8 += 2 * dstride; - dst16 += 2 * dstride; - h -= 2; - } while (h != 0); -} - -static INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int height) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - - int h = height; - do { - const uint16x8_t row = vld1q_u16(in); - if (is_lowbd) { - const uint8x8_t res_128 = vqmovn_u16(row); - vst1_u8(dst8, res_128); - } else { - 
vst1q_u16(dst16, row); - } - - in += CDEF_BSTRIDE; - dst8 += dstride; - dst16 += dstride; - } while (--h != 0); + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]); } void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. 
After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = 
load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -795,16 +725,81 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (block_width == 
8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, 
dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -812,16 +807,91 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); 
+ uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -837,9 +907,30 @@ (void)coeff_shift; (void)block_width; if (block_width == 8) { - copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + const uint8x8_t res = vqmovn_u16(s); + vst1_u8(dst8, res); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); } else { - copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + const uint8x8_t res = vqmovn_u16(s); + store_u8x4_strided_x2(dst8, dstride, res); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -847,16 +938,213 @@ 
int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, 
CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -864,16 +1152,78 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - 
/*enable_secondary=*/0); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -881,16 +1231,89 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int 
block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = 
load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -906,8 +1329,27 @@ (void)coeff_shift; (void)block_width; if (block_width == 8) { - copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + vst1q_u16(dst16, s); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + store_u16x4_strided_x2(dst16, dstride, s); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } }
diff --git a/av1/common/arm/compound_convolve_neon.c b/av1/common/arm/compound_convolve_neon.c index 2e6af68..6a59623 100644 --- a/av1/common/arm/compound_convolve_neon.c +++ b/av1/common/arm/compound_convolve_neon.c
@@ -336,10 +336,8 @@ dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); - store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -425,10 +423,8 @@ vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); - store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -647,7 +643,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(dst8_ptr, d01, 0); + store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; @@ -860,7 +856,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(dst8_ptr, d01, 0); + store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; @@ -1321,10 +1317,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -1348,7 +1342,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); 
- store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -1540,10 +1534,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -1566,7 +1558,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -1998,10 +1990,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -2029,7 +2019,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -2278,10 +2268,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -2308,7 +2296,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + 
store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2;
diff --git a/av1/common/arm/compound_convolve_neon.h b/av1/common/arm/compound_convolve_neon.h index cff6838..d719680 100644 --- a/av1/common/arm/compound_convolve_neon.h +++ b/av1/common/arm/compound_convolve_neon.h
@@ -282,10 +282,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -308,7 +306,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -437,10 +435,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -462,7 +458,7 @@ uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -761,10 +757,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * 
dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -789,7 +783,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -924,10 +918,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -951,7 +943,7 @@ uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1;
diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c index 8ab613d..40befdf 100644 --- a/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -80,17 +80,15 @@ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; - const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // Fold horiz_const into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + - (1 << ((ROUND0_BITS - 1) - 1))); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; @@ -334,15 +332,14 @@ // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. 
+ // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -380,10 +377,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -457,15 +452,14 @@ // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. 
+ int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -503,10 +497,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -578,15 +570,14 @@ // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset;
diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c index 70d7da9..a72af9e 100644 --- a/av1/common/arm/compound_convolve_neon_i8mm.c +++ b/av1/common/arm/compound_convolve_neon_i8mm.c
@@ -335,10 +335,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -450,10 +448,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride;
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c index fa98922..70cf23b 100644 --- a/av1/common/arm/convolve_neon.c +++ b/av1/common/arm/convolve_neon.c
@@ -121,10 +121,8 @@ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - store_u8_4x1(d + 0 * dst_stride, d01, 0); - store_u8_4x1(d + 1 * dst_stride, d01, 1); - store_u8_4x1(d + 2 * dst_stride, d23, 0); - store_u8_4x1(d + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(d, dst_stride, d01); + store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -178,7 +176,7 @@ uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0))); - store_u8_4x1(d, dd0, 0); + store_u8_4x1(d, dd0); s += 4; d += 4; @@ -190,18 +188,95 @@ #endif // AOM_ARCH_AARCH64 } -static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, +static INLINE uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, - const int16x4_t horiz_const) { - int16x4_t sum = horiz_const; - sum = vmla_lane_s16(sum, s0, filter, 0); - sum = vmla_lane_s16(sum, s1, filter, 1); - sum = vmla_lane_s16(sum, s2, filter, 2); - sum = vmla_lane_s16(sum, s3, filter, 3); + int16x8_t horiz_const) { + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} - // We halved the convolution filter values so - 1 from the right shift. - return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1); +static INLINE void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16_t *x_filter_ptr) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); + + if (w == 4) { + do { + uint8x8_t t01[4]; + t01[0] = load_unaligned_u8(src_ptr + 0, src_stride); + t01[1] = load_unaligned_u8(src_ptr + 1, src_stride); + t01[2] = load_unaligned_u8(src_ptr + 2, src_stride); + t01[3] = load_unaligned_u8(src_ptr + 3, src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + uint8x8_t d01 = + convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + int width = w; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + do { + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); + + int16x8_t s0[4], s1[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); + + uint8x8_t d0 = + convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + 
uint8x8_t d1 = + convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); + + store_u8_8x2(d, dst_stride, d0, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 2 * src_stride; + dst_ptr += 2 * dst_stride; + h -= 2; + } while (h != 0); + } } static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, @@ -244,12 +319,20 @@ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by @@ -257,149 +340,220 @@ // The outermost -1 is needed because we will halve the filter values. const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); - if (w <= 4) { - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); - - src += 2; - - do { - uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 - - uint8x8_t d0 = - convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const)); - - store_u8_4x1(dst, d0, 0); - - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - // Filter values are even so halve to reduce precision requirements. - const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + // Filter values are even so halve to reduce precision requirements. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 - while (h >= 8) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + while (h >= 8) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int width = w; + const uint8_t *s = src + 7; + uint8_t *d = dst; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + 
__builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, + &t14); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + + uint8x8_t d0 = + convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); + uint8x8_t d1 = + convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const); + uint8x8_t d2 = + convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const); + uint8x8_t d3 = + convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const); + uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + horiz_const); + uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, + horiz_const); + uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, + horiz_const); + uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (h-- != 0) { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 
a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int width = w; + const uint8_t *s = src + 8; + uint8_t *d = dst; + + __builtin_prefetch(d); + + do { + uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint8x8_t d0 = + convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); + + vst1_u8(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } +} + +static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, int w, int h, + const int16_t *filter_y) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); + + if (w == 4) { + uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride); + + int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + + src += 2 * src_stride; + + do { + uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride); + uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride); + uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride); + + int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); + int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); + int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); + int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); + + uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter); + uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + uint8x8_t t0, t1, t2; + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - int width = w; - const uint8_t *s = src + 7; + int height = h; + const uint8_t *s = src + 3 * src_stride; uint8_t *d = dst; - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * 
dst_stride); - __builtin_prefetch(d + 4 * dst_stride); - __builtin_prefetch(d + 5 * dst_stride); - __builtin_prefetch(d + 6 * dst_stride); - __builtin_prefetch(d + 7 * dst_stride); - do { - uint8x8_t t8, t9, t10, t11, t12, t13, t14; - load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + uint8x8_t t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, - &t14); - int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); - int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); - int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); - int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); - int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); - int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); - int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); - uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, - horiz_const); - uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, - horiz_const); - uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, - horiz_const); - uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, - horiz_const); - uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, - horiz_const); - uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, - x_filter, horiz_const); - uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, - x_filter, horiz_const); - uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, - x_filter, horiz_const); + uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter); + uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter); + uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, 
filter); + uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter); - transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + s0 = s4; + s1 = s5; + s2 = s6; - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } -#endif // AOM_ARCH_AARCH64 - - while (h-- != 0) { - uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - - int width = w; - const uint8_t *s = src + 8; - uint8_t *d = dst; - - __builtin_prefetch(d); - - do { - uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); - - int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 - - uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, - horiz_const); - - vst1_u8(d, d0); - - s0 = s8; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += src_stride; - dst += dst_stride; - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } @@ -479,10 +633,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - 
store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -501,7 +653,7 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -665,10 +817,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -689,7 +839,7 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -885,10 +1035,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -982,7 +1130,7 @@ } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int vert_offset = clamped_y_taps / 2 - 1; src -= vert_offset * src_stride; @@ -999,7 +1147,10 @@ // Filter values are even so halve to reduce precision requirements. 
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); - if (y_filter_taps < 8) { + if (y_filter_taps <= 4) { + convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (y_filter_taps == 6) { convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); } else { convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); @@ -1156,18 +1307,122 @@ } while (--h != 0); } -static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, +static INLINE int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, - const int16x4_t horiz_const) { - int16x4_t sum = horiz_const; - sum = vmla_lane_s16(sum, s0, filter, 0); - sum = vmla_lane_s16(sum, s1, filter, 1); - sum = vmla_lane_s16(sum, s2, filter, 2); - sum = vmla_lane_s16(sum, s3, filter, 3); + const int16x8_t horiz_const) { + int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + // We halved the filter values so -1 from right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} - // We halved the convolution filter values so -1 from the right shift. - return vshr_n_s16(sum, ROUND0_BITS - 1); +static INLINE void convolve_2d_sr_horiz_4tap_neon( + const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { + const int bd = 8; + // All filter values are even, halve to reduce intermediate precision + // requirements. + const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) 
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + if (w == 4) { + do { + uint8x8_t t01[4]; + t01[0] = load_unaligned_u8(src + 0, (int)src_stride); + t01[1] = load_unaligned_u8(src + 1, (int)src_stride); + t01[2] = load_unaligned_u8(src + 2, (int)src_stride); + t01[3] = load_unaligned_u8(src + 3, (int)src_stride); + + int16x8_t s01[4]; + s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); + s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); + s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); + s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); + + int16x8_t d01 = + convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); + + store_s16x4_strided_x2(dst, (int)dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + int16_t *d = dst; + + do { + uint8x8_t t0[4], t1[4]; + load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); + + int16x8_t s0[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + int16x8_t s1[4]; + s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); + s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); + s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); + + int16x8_t d0 = + convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + int16x8_t d1 = + convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); + + store_s16_8x2(d, dst_stride, d0, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 2); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x8_t t0[4]; + 
load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]); + + int16x8_t s0[4]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); + s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); + s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); + s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); + + int16x8_t d0 = + convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } } static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, @@ -1193,10 +1448,9 @@ return vshrq_n_s16(sum, ROUND0_BITS - 1); } -static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride, - int16_t *im_block, int im_stride, - int w, int im_h, - const int16_t *x_filter_ptr) { +static INLINE void convolve_2d_sr_horiz_8tap_neon( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { const int bd = 8; const uint8_t *src_ptr = src; @@ -1204,149 +1458,119 @@ int dst_stride = im_stride; int height = im_h; - if (w <= 4) { - // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // (The extra -1 is needed because we halved the filter values.) - const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + - (1 << ((ROUND0_BITS - 1) - 1))); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); - - src_ptr += 2; - - do { - uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 - - int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); - - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); - } else { - // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // (The extra -1 is needed because we halved the filter values.) - const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + - (1 << ((ROUND0_BITS - 1) - 1))); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 - while (height > 8) { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + while (height > 8) { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s += 7; - - do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - - transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - - int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - int16x8_t s14 = 
vreinterpretq_s16_u16(vmovl_u8(t7)); - - int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, - x_filter, horiz_const); - int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, - x_filter, horiz_const); - int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, - x_filter, horiz_const); - int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, - x_filter, horiz_const); - int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, - x_filter, horiz_const); - int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, - x_filter, horiz_const); - int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, - x_filter, horiz_const); - int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, - x_filter, horiz_const); - - transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 8 * src_stride; - dst_ptr += 8 * dst_stride; - height -= 8; - } -#endif // AOM_ARCH_AARCH64 + s += 7; do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 - int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - do { - uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 - int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); - int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, - x_filter, horiz_const); + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - vst1q_s16(d, d0); + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - s0 = s8; - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--height != 0); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; } +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s = src_ptr; 
+ int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + do { + uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + + vst1q_s16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); } void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, @@ -1363,7 +1587,8 @@ } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -1393,12 +1618,20 @@ DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, - x_filter_ptr); + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { @@ -1431,11 +1664,11 @@ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d0, 0); - store_u8_2x1(dst + 1 * dst_stride, d1, 0); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; @@ -1502,11 +1735,11 @@ uint8x8_t d1 = vrhadd_u8(s1, s2); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d0, 0); - store_u8_2x1(dst + 1 * dst_stride, d1, 0); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; @@ -1626,14 +1859,15 @@ uint16x4_t sum0 = vadd_u16(s0, s1); uint16x4_t sum1 = vadd_u16(s1, s2); - uint8x8_t d01 = vqrshrn_n_u16(vcombine_u16(sum0, sum1), 
2); + uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2); + uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d01, 0); - store_u8_2x1(dst + 1 * dst_stride, d01, 2); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } im += 2 * im_stride;
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h index 6b8edf8..5a9f8b6 100644 --- a/av1/common/arm/convolve_neon.h +++ b/av1/common/arm/convolve_neon.h
@@ -127,10 +127,8 @@ uint8x8_t d01 = vqmovun_s16(dd01); uint8x8_t d23 = vqmovun_s16(dd23); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -282,10 +280,8 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -303,7 +299,7 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -452,10 +448,8 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -471,7 +465,7 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -541,4 +535,112 @@ } } +static INLINE int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const 
int16x4_t s3, + const int16x4_t y_filter) { + int32x4_t sum = vmull_lane_s16(s0, y_filter, 0); + sum = vmlal_lane_s16(sum, s1, y_filter, 1); + sum = vmlal_lane_s16(sum, s2, y_filter, 2); + sum = vmlal_lane_s16(sum, s3, y_filter, 3); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t y_filter, + const int16x8_t sub_const) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t filter = vld1_s16(y_filter + 2); + + if (w == 4) { + int16x4_t s0, s1, s2; + load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2); + src_ptr += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6); + + int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter); + int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter); + int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter); + int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter); + + uint8x8_t d01 = 
vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const); + uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const); + uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const); + uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + #endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index ba8f7e7..32b056d 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -21,233 +21,305 @@ #include "av1/common/convolve.h" #include "av1/common/filter.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1); - sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t acc = + vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); + + int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); + sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1); + sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2); return vqrshrn_n_s32(sum, FILTER_BITS); } static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples[2], permuted_samples[4]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
- clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); - clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) + }; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), + vqtbl1q_s8(samples_128[1], + permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); - // Second 4 output values. - sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. 
+ // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t acc = + vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); + + int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + + int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), - vqrshrn_n_s32(sum[1], FILTER_BITS)); + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } static INLINE void convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - const int32_t correction_s32 = - vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), - vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); - // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right - // shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. 
- int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); - const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. - src += 5; + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0, 0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - if (w <= 4) { - 
do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = - convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } -static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, +static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, + const int8x8_t filters, const uint8x16_t permute_tbl) { - // 
Clamp sample range to [-128, 127] for 8-bit signed dot product. - int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); - // Accumulate dot product into 'correction' to account for range clamp. - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0); + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); - // Packing is performed by the caller. + // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum[2]; +static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + + int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_4tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int w = width; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. 
*/ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); - // Second 4 output values. - sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + // Dot product constants: + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); + + int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. 
return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } @@ -269,120 +341,572 @@ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - // Dot product constants. - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single - // rounding right shift by FILTER_BITS - instead of a first rounding right - // shift by ROUND0_BITS, followed by second rounding right shift by - // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. - const int32x4_t correction = - vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); - const uint8x16_t range_limit = vdupq_n_u8(128); - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - src += 2; + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 
04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); +} + +static INLINE int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1, + const int8x16_t s2, + const int8x8_t filters_0_7, + const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0); + sum = vdotq_lane_s32(sum, s1, filters_0_7, 1); + sum = vdotq_lane_s32(sum, s2, filters_4_11, 1); + + // Further narrowing and packing is performed by the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve12_8_y( + const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, + const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. 
+ const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0); + sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0); + sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_12tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr) { + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &tA); + src_ptr += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); + + do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = - convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); int16x4_t d1 = - convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); int16x4_t d2 = - convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); int16x4_t d3 = - convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - // We halved the convolution filter values so - 1 from the right shift. - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; - src += 4 * src_stride; - dst += 4 * dst_stride; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); - do { - int width = w; - const uint8_t *s = src; - uint8_t *d = dst; + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + s += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); + + int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); uint8x8_t d0 = - convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); uint8x8_t d1 = - convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); uint8x8_t d2 = - convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); uint8x8_t d3 = - convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0); + sum = vdotq_lane_s32(sum, s1, filters, 1); + + // Further narrowing and packing is performed by the caller. + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_y(const int8x16_t s0_lo, + const int8x16_t s0_hi, + const int8x16_t s1_lo, + const int8x16_t s1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0); + sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0); + sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1); + + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_8tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr) { + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src_ptr += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; + s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); + + int8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; + s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } +void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps <= 6) { + av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + return; + } + + const int vert_offset = y_filter_taps / 2 - 1; + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (y_filter_taps > 8) { + convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + return; + } + + convolve_y_sr_8tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); +} + static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, + const int32x4_t horiz_const, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); - sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1); - sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1); + sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2); // Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS); @@ -391,317 +915,103 @@ static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filters, const int32x4_t correction, - const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples[2], permuted_samples[4]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); - clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) + }; // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), + vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), + vqtbl1q_s8(samples_128[1], + permute_tbl.val[2]) }; - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); - sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); - // Second 4 output values. - sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); - sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); + + int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); // Narrow and re-pack. 
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), - vshrn_n_s32(sum[1], ROUND0_BITS)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), + vshrn_n_s32(sum4567, ROUND0_BITS)); } static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + const int bd = 8; - // Special case the following no-op filter as 128 won't fit into the 8-bit - // signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. - vst1q_s16(d, vreinterpretq_s16_u16(d0)); - - d += 8; - s += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - - } else { - // Narrow filter values to 8-bit. 
- const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - // Dot product constants. - const int32x4_t correct_tmp = - vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), - vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); - const int32x4_t correction = - vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); - const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit, - permute_tbl); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - - } else { - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = 
convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, - range_limit, permute_tbl); - - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } - } -} - -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16_t permute_tbl) { - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); - - // Accumulate dot product into 'correction' to account for range clamp. - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); - - // We halved the convolution filter values so -1 from the right shift. 
- return vshrn_n_s32(sum, ROUND0_BITS - 1); -} - -static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum[2]; - - // Clamp sample range to [-128, 127] for 8-bit signed dot product. - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - // Accumulate dot product into 'correction' to account for range clamp. - // First 4 output values. - sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); - // Second 4 output values. - sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); - - // Narrow and re-pack. - // We halved the convolution filter values so -1 from the right shift. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), - vshrn_n_s32(sum[1], ROUND0_BITS - 1)); -} - -static INLINE void convolve_2d_sr_horiz_neon_dotprod( - const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, - int im_h, const int16_t *x_filter_ptr) { - const int bd = 8; - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. 
- // The outermost -1 is needed because we halved the filter values. const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. - const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); - const uint8x16_t range_limit = vdupq_n_u8(128); - - const uint8_t *src_ptr = src; - int16_t *dst_ptr = im_block; - int dst_stride = im_stride; - int height = im_h; + const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; - do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = - convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; - height -= 4; - } while (height > 4); + h -= 4; + } while (h > 4); do { uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; - } while (--height != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + } while (--h != 0); + } else { do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, - permute_tbl); - int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, - permute_tbl); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl); + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl); + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); @@ -711,8 +1021,8 @@ } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; - height -= 4; - } while (height > 4); + h -= 4; + } while (h > 4); do { const uint8_t *s = src_ptr; @@ -720,9 +1030,10 @@ int width = w; do { - uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); vst1q_s16(d, d0); s += 8; @@ -731,7 +1042,436 @@ } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; - } while (--height != 0); + } while (--h != 0); + } +} + +static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl, + const int32x4_t correction) { + // Transform sample range to [-128, 127] 
for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); + + // Accumulate into 'correction' to account for range transform. + int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl, + const int32x4_t correction) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 'correction' to account for range transform. + int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); + int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); + + // Narrow and re-pack. + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_4tap_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { + const int bd = 8; + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); + int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction); + int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction); + int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src); + int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); + vst1_s16(dst, d0); + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); + int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction); + int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction); + int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * 
src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src; + int16_t *d = dst; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate dot product into 'correction' to account for range transform. + int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); + sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + + int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); + sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
+ return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_8tap_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + const int bd = 8; + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we halved the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width 
-= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE void convolve_2d_sr_6tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const int bd = 8; + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; + load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); + s += 5 * src_stride; + + int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl); + int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl); + int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl); + int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl); + int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl); + + do { + uint8x16_t h_s5, h_s6, h_s7, h_s8; + load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); + + int16x8_t v_s5 = + convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl); + int16x8_t v_s6 = + convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl); + 
int16x8_t v_s7 = + convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl); + int16x8_t v_s8 = + convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl); + + uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + y_filter, vert_const); + uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + y_filter, vert_const); + uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + y_filter, vert_const); + uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void convolve_2d_sr_4tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { + const int bd = 8; + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction); + int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction); + int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t v_s3 = + convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction); + int16x4_t v_s4 = + convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction); + int16x4_t v_s5 = + convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction); + int16x4_t v_s6 = + convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction); + + int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); + int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); + int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); + int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x8_t v_s0 = + convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction); + int16x8_t v_s1 = + 
convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction); + int16x8_t v_s2 = + convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x8_t v_s3 = + convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction); + int16x8_t v_s4 = + convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction); + int16x8_t v_s5 = + convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction); + int16x8_t v_s6 = + convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction); + + uint8x8_t d0 = + convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); + uint8x8_t d1 = + convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); + uint8x8_t d2 = + convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); + uint8x8_t d3 = + convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } @@ -750,7 +1490,8 @@ } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -778,15 +1519,35 @@ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { + if (x_filter_taps >= 6 && y_filter_taps == 6) { + convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); + return; + } + + if (x_filter_taps <= 4 && y_filter_taps <= 4) { + convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride, + w, h, x_filter_ptr, y_filter_ptr); + return; + } + DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride, - w, im_h, x_filter_ptr); + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else {
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index 14140ca..cd989cb 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -18,34 +18,34 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/convolve_neon_i8mm.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. 
- sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); - sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1); - sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); + sum = vusdotq_laneq_s32(sum, perm_samples[1], filter, 1); + sum = vusdotq_laneq_s32(sum, perm_samples[2], filter, 2); return vqrshrn_n_s32(sum, FILTER_BITS); } @@ -54,31 +54,29 @@ const int8x16_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[4]; - int32x4_t sum[2]; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), + vqtbl1q_u8(samples[0], permute_tbl.val[1]), + vqtbl1q_u8(samples[0], permute_tbl.val[2]), + vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; - // First 4 output values. - sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); - // Second 4 output values. 
- sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + int32x4_t sum0123 = + vusdotq_laneq_s32(horiz_const, perm_samples[0], filter, 0); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + + int32x4_t sum4567 = + vusdotq_laneq_s32(horiz_const, perm_samples[1], filter, 0); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); // Narrow and re-pack. - int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), - vqrshrn_n_s32(sum[1], FILTER_BITS)); + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } @@ -86,135 +84,191 @@ int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. - src += 5; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. 
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0, 0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. 
- const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } -static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, - const uint8x16_t permute_tbl, - const int32x4_t horiz_const) { +static INLINE int16x4_t convolve4_4_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { // Permute samples ready 
for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0); + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); + int32x4_t sum = vusdotq_lane_s32(acc, perm_samples, filters, 0); - // Packing is performed by the caller. + // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } +static INLINE uint8x8_t convolve4_8_x(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we halved the filter values. + int32x4_t acc = vdupq_n_s32((1 << (ROUND0_BITS - 1)) / 2); + + int32x4_t sum0123 = vusdotq_lane_s32(acc, perm_samples[0], filters, 0); + int32x4_t sum4567 = vusdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); + // We halved the filter values so -1 from right shift. 
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_x_sr_4tap_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_x(s0, filter, perm_tbl); + int16x4_t t1 = convolve4_4_x(s1, filter, perm_tbl); + int16x4_t t2 = convolve4_4_x(s2, filter, perm_tbl); + int16x4_t t3 = convolve4_4_x(s3, filter, perm_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int w = width; + const uint8_t *s = src; + uint8_t *d = dst; + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_x(s0, filter, perm_tbl); + uint8x8_t d1 = convolve4_8_x(s1, filter, perm_tbl); + uint8x8_t d2 = convolve4_8_x(s2, filter, perm_tbl); + uint8x8_t d3 = convolve4_8_x(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + static INLINE uint8x8_t 
convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum[2]; - // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. - sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0); - sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); - // Second 4 output values. - sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0); - sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); - int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); + + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. 
return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } @@ -236,320 +290,519 @@ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); - if (filter_params_x->taps > 8) { + int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_taps > 8) { convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } + if (filter_taps <= 4) { + convolve_x_sr_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. + // The outermost -1 is needed because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src += 2; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const); + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); - // We halved the convolution filter values so - 1 from the right shift. 
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); +} + +static INLINE int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, + const int8x8_t filters_0_7, + const int8x8_t filters_4_11) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0); + sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1); + sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1); + + // Further narrowing and packing is performed by the caller. 
+ return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve12_8_y( + const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, + const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0); + sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0); + sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &sA); + src_ptr += 11 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); - uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); + uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. 
+ uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } -static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, - const int8x16_t filters, - const uint8x16x3_t permute_tbl, - int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum; +static INLINE int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1, + const int8x8_t filters) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); + sum = vusdotq_lane_s32(sum, s1, filters, 1); - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - // First 4 output values. - sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); - sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); - - // Narrow and re-pack. - return vshrn_n_s32(sum, ROUND0_BITS); + // Further narrowing and packing is performed by the caller. 
+ return vqmovn_s32(sum); } -static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], - const int8x16_t filters, - const uint8x16x3_t permute_tbl, - const int32x4_t horiz_const) { - uint8x16_t permuted_samples[4]; - int32x4_t sum[2]; +static INLINE uint8x8_t convolve8_8_y(const uint8x16_t s0_lo, + const uint8x16_t s0_hi, + const uint8x16_t s1_lo, + const uint8x16_t s1_hi, + const int8x8_t filters) { + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0); + sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1); - // Permute samples ready for dot product. - // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); - // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); - // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); - // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } - permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); - - // First 4 output values. - sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); - sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); - // Second 4 output values. - sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); - sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0); + sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1); // Narrow and re-pack. 
- return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), - vshrn_n_s32(sum[1], ROUND0_BITS)); + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( - const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, - const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, - const int16x4_t x_filter_8_11) { - const int bd = 8; +static INLINE void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); - do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. 
This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. - vst1q_s16(d, vreinterpretq_s16_u16(d0)); + uint8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - d += 8; - s += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. 
- const int32x4_t horiz_const = - vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + do { + int height = h; + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = - convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = - convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = - convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. 
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - } else { - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; + s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; + s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = - convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = - convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = - convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + uint8x8_t d0 = + convolve8_8_y(s0123_lo, 
s0123_hi, s4567_lo, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - s += 8; - d += 8; - width -= 8; - } while (width != 0); + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } -static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, - const int8x8_t filters, - const uint8x16_t permute_tbl, - const int32x4_t horiz_const) { - // Permute samples ready for dot product. 
- // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); +void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } - // First 4 output values. - int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - // We halved the convolution filter values so -1 from the right shift. - return vshrn_n_s32(sum, ROUND0_BITS - 1); + if (y_filter_taps <= 6) { + av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + return; + } + + const int vert_offset = y_filter_taps / 2 - 1; + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (y_filter_taps > 8) { + convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + return; + } + convolve_y_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); } static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { - uint8x16_t permuted_samples[3]; - int32x4_t sum[2]; - // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; - // First 4 output values. - sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); - sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); - // Second 4 output values. - sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); - sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + int32x4_t sum0123 = + vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); + + int32x4_t sum4567 = + vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); + sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. - return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), - vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } -static INLINE void convolve_2d_sr_horiz_neon_i8mm( +static INLINE void convolve_2d_sr_horiz_8tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + const int bd = 8; // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. @@ -562,89 +815,371 @@ int dst_stride = im_stride; int height = im_h; - if (w <= 4) { - const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); - // 4-tap filters are used for blocks having width <= 4. - // Filter values are even, so halve to reduce intermediate precision reqs. - const int8x8_t x_filter = - vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); - - src_ptr += 2; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; do { uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = 
convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE int16x4_t convolve4_4_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve4_8_2d_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum0123 = + vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); + int32x4_t sum4567 = + vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); + + // Narrow and re-pack. + // We halved the filter values so -1 from right shift. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), + vshrn_n_s32(sum4567, ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_4tap_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width, + int height, const int16_t *filter_x) { + const int bd = 8; + const int16x4_t x_filter = vld1_s16(filter_x + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we halved the filter values. + const int32x4_t horiz_const = vdupq_n_s32( + (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2)); + + if (width == 4) { + const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const); + + store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; height -= 4; } while (height > 4); do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); + uint8x16_t s0 = vld1q_u8(src); + int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); + vst1_s16(dst, d0); - src_ptr += src_stride; - dst_ptr += dst_stride; + src += src_stride; + dst += dst_stride; } while (--height != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - // Filter values are even, so halve to reduce intermediate precision reqs. 
- const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); - + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + int w = width; + const uint8_t *s = src; + int16_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); + int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const); + int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const); + int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; height -= 4; } while (height > 4); do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; + const uint8_t *s = src; + int16_t *d = dst; + int w = width; do { uint8x16_t s0 = vld1q_u8(s); - int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; + w -= 8; + } while (w != 0); + src += src_stride; + dst += dst_stride; } while (--height != 0); } } +static INLINE void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + const int16_t *y_filter_ptr) { + const int16x8_t y_filter = 
vld1q_s16(y_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; + load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); + s += 5 * src_stride; + + int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + + do { + uint8x16_t h_s5, h_s6, h_s7, h_s8; + load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); + + int16x8_t v_s5 = + convolve8_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x8_t v_s6 = + convolve8_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + int16x8_t v_s7 = + convolve8_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); + int16x8_t v_s8 = + convolve8_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); + + uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + y_filter, vert_const); + uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + y_filter, vert_const); + uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + y_filter, vert_const); + uint8x8_t d3 = 
convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void convolve_2d_sr_4tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + const int16_t *y_filter_ptr) { + const int bd = 8; + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we halved the filter values. 
+ const int32x4_t horiz_const = vdupq_n_s32( + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2); + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t v_s3 = + convolve4_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x4_t v_s4 = + convolve4_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + int16x4_t v_s5 = + convolve4_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x4_t v_s6 = + convolve4_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + + int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); + int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); + int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); + int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x8_t v_s0 = + convolve4_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); + 
int16x8_t v_s1 = + convolve4_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); + int16x8_t v_s2 = + convolve4_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x8_t v_s3 = + convolve4_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); + int16x8_t v_s4 = + convolve4_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); + int16x8_t v_s5 = + convolve4_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); + int16x8_t v_s6 = + convolve4_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); + + uint8x8_t d0 = + convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); + uint8x8_t d1 = + convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); + uint8x8_t d2 = + convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); + uint8x8_t d3 = + convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -659,7 +1194,8 @@ } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); - const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; @@ -690,12 +1226,32 @@ DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, - im_h, x_filter_ptr); + if (y_filter_taps == 6 && x_filter_taps >= 6) { + convolve_2d_sr_6tap_neon_i8mm(src_ptr, src_stride, dst, dst_stride, w, h, + x_filter_ptr, y_filter_ptr); + return; + } + + if (y_filter_taps <= 4 && x_filter_taps <= 4) { + convolve_2d_sr_4tap_neon_i8mm(src_ptr + 2, src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); + return; + } + + if (x_filter_taps <= 4) { + convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } else { + convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr); + } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); - if (clamped_y_taps <= 6) { + if (clamped_y_taps <= 4) { + convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_ptr); + } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else {
diff --git a/av1/common/arm/convolve_neon_i8mm.h b/av1/common/arm/convolve_neon_i8mm.h new file mode 100644 index 0000000..15a8a4e --- /dev/null +++ b/av1/common/arm/convolve_neon_i8mm.h
@@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + int32x4_t sum = vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum = vusdotq_laneq_s32(sum, perm_samples[1], filters, 1); + sum = vusdotq_laneq_s32(sum, perm_samples[2], filters, 2); + + // Narrow and re-pack. 
+ return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), + vqtbl1q_u8(samples[0], permute_tbl.val[1]), + vqtbl1q_u8(samples[0], permute_tbl.val[2]), + vqtbl1q_u8(samples[1], permute_tbl.val[2]) }; + + int32x4_t sum0123 = + vusdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); + sum0123 = vusdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); + + int32x4_t sum4567 = + vusdotq_laneq_s32(horiz_const, perm_samples[1], filters, 0); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); + sum4567 = vusdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); + + // Narrow and re-pack. + return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), + vshrn_n_s32(sum4567, ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + + const int bd = 8; + + // Narrow filter values to 8-bit. 
+ const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while 
(width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_
diff --git a/av1/common/arm/convolve_scale_neon.h b/av1/common/arm/convolve_scale_neon.h new file mode 100644 index 0000000..2253b54 --- /dev/null +++ b/av1/common/arm/convolve_scale_neon.h
@@ -0,0 +1,921 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static INLINE int16x4_t compound_convolve8_4_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE int16x8_t compound_convolve8_8_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + 
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); + + return vcombine_s16(res0, res1); +} + +static INLINE void compound_convolve_vert_scale_8tap_neon( + const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. 
+ const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1_u16(dst, vreinterpret_u16_s16(d0)); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + vst1q_u16(d, vreinterpretq_u16_s16(d0)); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_avg_convolve_vert_scale_8tap_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts + // on modern CPUs. 
+ const int32_t vert_offset_bits = + (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); + // For the averaging code path subtract round offset and convolve round. + const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); + const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int16x4_t avg = vhadd_s16(dd0, d0); + int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int16x8_t avg = vhaddq_s16(dd0, d0); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + + vst1_u8(dst8_ptr, 
d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_dist_wtd_convolve_vert_scale_8tap_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + int y_qn = subpel_y_qn; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. + const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + // For the weighted averaging code path we have to subtract round offset and + // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - + // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The + // additional shift by DIST_PRECISION_BITS is needed in order to merge two + // shift calculations into one. 
+ const int32x4_t dist_wtd_offset = vdupq_n_s32( + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); + const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); + const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); + dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); + + int16x4_t d0_s16 = vshrn_n_s32( + dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, + filter, vert_offset); + + int16x8_t dd0 = 
vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int32x4_t dst_wtd_avg0 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); + int32x4_t dst_wtd_avg1 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); + + dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); + dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); + + int16x4_t d0_s16_0 = vshrn_n_s32( + dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + int16x4_t d0_s16_1 = vshrn_n_s32( + dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); +} + +static INLINE uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const 
int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res0, res1)); +} + +static INLINE void convolve_vert_scale_8tap_neon( + const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; + // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
+ int32x4_t vert_offset = + vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); + + int y_qn = subpel_y_qn; + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + store_u8_4x1(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else if (w == 8) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d = + convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); + + vst1_u8(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + do { + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], + &s5[0], &s6[0], &s7[0]); + load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], + &s5[1], &s6[1], &s7[1]); + + uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter, vert_offset); + uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], 
filter, vert_offset); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE int16x4_t compound_convolve6_4_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE int16x8_t compound_convolve6_8_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. 
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); + + return vcombine_s16(res0, res1); +} + +static INLINE void compound_convolve_vert_scale_6tap_neon( + const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. 
+ const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5; + load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x4_t d0 = + compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + vst1_u16(dst, vreinterpret_u16_s16(d0)); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3, s4, s5; + load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x8_t d0 = + compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + vst1q_u16(d, vreinterpretq_u16_s16(d0)); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_avg_convolve_vert_scale_6tap_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts + // on modern CPUs. 
+ const int32_t vert_offset_bits = + (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); + // For the averaging code path subtract round offset and convolve round. + const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); + const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); + + int y_qn = subpel_y_qn; + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5; + load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x4_t d0 = + compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int16x4_t avg = vhadd_s16(dd0, d0); + int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5; + load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x8_t d0 = + compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int16x8_t avg = vhaddq_s16(dd0, d0); + + uint8x8_t d0_u8 = vqrshrun_n_s16( + avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; 
+ width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE void compound_dist_wtd_convolve_vert_scale_6tap_neon( + const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, + uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, + ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + int y_qn = subpel_y_qn; + // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use + // non-rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. + const int32x4_t vert_offset = + vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); + // For the weighted averaging code path we have to subtract round offset and + // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - + // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The + // additional shift by DIST_PRECISION_BITS is needed in order to merge two + // shift calculations into one. 
+ const int32x4_t dist_wtd_offset = vdupq_n_s32( + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); + const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); + const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); + + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5; + load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x4_t d0 = + compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); + + int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); + dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); + + int16x4_t d0_s16 = vshrn_n_s32( + dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); + + store_u8_4x1(dst8, d0_u8); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int width = w; + uint8_t *dst8_ptr = dst8; + uint16_t *dst16_ptr = dst16; + + do { + int16x8_t s0, s1, s2, s3, s4, s5; + load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + int16x8_t d0 = + compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); + + int32x4_t 
dst_wtd_avg0 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); + int32x4_t dst_wtd_avg1 = + vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); + + dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); + dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); + + int16x4_t d0_s16_0 = vshrn_n_s32( + dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + int16x4_t d0_s16_1 = vshrn_n_s32( + dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + + DIST_PRECISION_BITS); + + uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); + + vst1_u8(dst8_ptr, d0_u8); + + s += 8; + dst8_ptr += 8; + dst16_ptr += 8; + width -= 8; + } while (width != 0); + + dst16 += dst16_stride; + dst8 += dst8_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +static INLINE uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. 
+ sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); +} + +static INLINE uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, + const int32x4_t offset_const) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); + int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res0, res1)); +} + +static INLINE void convolve_vert_scale_6tap_neon( + const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *y_filter, int 
subpel_y_qn, int y_step_qn) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; + // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. + int32x4_t vert_offset = + vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); + + int y_qn = subpel_y_qn; + if (w == 4) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x4_t s0, s1, s2, s3, s4, s5; + load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + uint8x8_t d = convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + store_u8_4x1(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else if (w == 8) { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + int16x8_t s0, s1, s2, s3, s4, s5; + load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); + + uint8x8_t d = convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); + + vst1_u8(dst, d); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } else { + do { + const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + const ptrdiff_t filter_offset = + SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); + const int16x8_t filter = vld1q_s16(y_filter + filter_offset); + + do { + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2]; + load_s16_8x6(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], + &s5[0]); + load_s16_8x6(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], + &s5[1]); + + uint8x8_t d0 = convolve6_8_v(s0[0], s1[0], s2[0], s3[0], 
s4[0], s5[0], + filter, vert_offset); + uint8x8_t d1 = convolve6_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + filter, vert_offset); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + + dst += dst_stride; + y_qn += y_step_qn; + } while (--h != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_
diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c new file mode 100644 index 0000000..a274730 --- /dev/null +++ b/av1/common/arm/convolve_sve2.c
@@ -0,0 +1,203 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/highbd_convolve_sve2.h" +#include "av1/common/arm/convolve_neon_i8mm.h" + +static INLINE int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2], + int16x8_t s1[2], + int16x8_t s2[2], + int16x8_t filter_0_7, + int16x8_t filter_4_11) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1); + + return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); +} + +static INLINE void convolve_2d_sr_vert_12tap_sve2( + const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t y_filter_0_7, + const int16x8_t y_filter_4_11) { + // The no-op filter should never be used here. 
+ assert(vgetq_lane_s16(y_filter_0_7, 5) != 128); + + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + do { + int16_t *s = (int16_t *)src_ptr; + uint8_t *d = (uint8_t *)dst_ptr; + int height = h; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_4x4(s4, s5, s6, s7, s4567); + transpose_concat_4x4(s5, s6, s7, s8, s5678); + transpose_concat_4x4(s6, s7, s8, s9, s6789); + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_4x4(sB, sC, sD, sE, sBCDE); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); + + int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7, + y_filter_4_11); + int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7, + y_filter_4_11); + int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7, + y_filter_4_11); + int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7, + y_filter_4_11); + + int16x8_t dd01 = + vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS)); + int16x8_t dd23 = + vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS)); + + dd01 = vsubq_s16(dd01, sub_const); + dd23 = vsubq_s16(dd23, sub_const); + + uint8x8_t d01 = vqmovun_s16(dd01); + uint8x8_t d23 = vqmovun_s16(dd23); + + store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + w -= 4; + } while (w != 0); +} + +void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + if (filter_params_x->taps > 8) { + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_x->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_0_7, + x_filter_8_11); 
+ + convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_4_11); + } else { + av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } +}
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c index dc3f876..9247ded 100644 --- a/av1/common/arm/highbd_compound_convolve_neon.c +++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -20,266 +20,9 @@ #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" +#include "av1/common/arm/highbd_compound_convolve_neon.h" #include "av1/common/arm/highbd_convolve_neon.h" -#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - -static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, - int src_stride, uint16_t *dst_ptr, - int dst_stride, int w, int h, - ConvolveParams *conv_params, - const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint16x4_t avg = vhadd_u16(src, ref); - int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint16x8_t avg = vhaddq_u16(s, r); - int32x4_t d0_lo = - vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); - int32x4_t d0_hi = - vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); - - uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), - vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); - d0 = vminq_u16(d0, max); - vst1q_u16(dst, d0); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_comp_avg_neon(const uint16_t 
*src_ptr, int src_stride, - uint16_t *dst_ptr, int dst_stride, - int w, int h, - ConvolveParams *conv_params, - const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint16x4_t avg = vhadd_u16(src, ref); - int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint16x8_t avg = vhaddq_u16(s, r); - int32x4_t d0_lo = - vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); - int32x4_t d0_hi = - vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); - - uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), - vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); - d0 = vminq_u16(d0, max); - vst1q_u16(dst, d0); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_12_dist_wtd_comp_avg_neon( - const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint32x4_t offset_vec = vdupq_n_u32(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x4_t 
fwd_offset = vdup_n_u16(conv_params->fwd_offset); - uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); - - // Weighted averaging - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); - wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); - wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); - wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); - wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); - - uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); - wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); - wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); - int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); - - uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), - vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); - d01 = vminq_u16(d01, max); - vst1q_u16(dst, d01); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_dist_wtd_comp_avg_neon( - const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int 
bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint32x4_t offset_vec = vdupq_n_u32(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); - uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); - - // Weighted averaging - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); - wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); - wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); - wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); - wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); - - uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); - wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); - wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); - int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); - - uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), - vqrshrun_n_s32(d1, ROUND_SHIFT)); - d01 = vminq_u16(d01, max); - vst1q_u16(dst, d01); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } 
-} - static INLINE uint16x4_t highbd_12_convolve6_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, @@ -743,9 +486,6 @@ const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int offset_convolve = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); @@ -768,10 +508,10 @@ } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, offset_avg, bd); + w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params); } } else { if (x_filter_taps <= 6 && w != 4) { @@ -795,10 +535,10 @@ } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, offset_avg, bd); + h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, offset_avg, bd); + conv_params, bd); } } else { if (x_filter_taps <= 6 && w != 4) { @@ -971,6 +711,212 @@ } } +static INLINE uint16x4_t highbd_12_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x8_t highbd_12_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const 
int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE void highbd_12_dist_wtd_convolve_y_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = + highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = + highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = + highbd_12_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d3 = + highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, 
&s6); + + uint16x8_t d0 = + highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), + vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_y_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const 
int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = + highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { @@ -1148,9 +1094,6 @@ const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_y->taps / 2 - 
1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); @@ -1162,7 +1105,11 @@ if (bd == 12) { if (conv_params->do_average) { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_y_4tap_neon( + src + 2 * src_stride, src_stride, im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); @@ -1173,14 +1120,17 @@ } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, round_offset_avg, - bd); + w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params); } } else { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_y_4tap_neon( + src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); @@ -1192,7 +1142,11 @@ } } else { if (conv_params->do_average) { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, + im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); @@ -1203,13 +1157,17 @@ } if 
(conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, round_offset_avg, bd); + h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params, bd); } } else { - if (y_filter_taps <= 6) { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, + dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); @@ -1235,7 +1193,7 @@ uint16x4_t d = vshl_u16(s, round_shift_s16); d = vadd_u16(d, offset_u16); if (w == 2) { - store_u16_2x1(dst_ptr + y * dst_stride, d, 0); + store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); } @@ -1285,18 +1243,18 @@ if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, - w, h, conv_params, round_offset, bd); + w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, - h, conv_params, round_offset, bd); + h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset, bd); + conv_params); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, - conv_params, round_offset, bd); + conv_params, bd); } } } @@ -1949,9 +1907,6 @@ (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset_conv_y = (1 << y_offset_bits); - const int round_offset_avg = - ((1 << (y_offset_bits - conv_params->round_1)) + - (1 << (y_offset_bits - conv_params->round_1 - 1))); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; @@ -2012,19 
+1967,18 @@ if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, - w, h, conv_params, round_offset_avg, - bd); + w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, - h, conv_params, round_offset_avg, bd); + h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params); } else { highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, - conv_params, round_offset_avg, bd); + conv_params, bd); } } }
diff --git a/av1/common/arm/highbd_compound_convolve_neon.h b/av1/common/arm/highbd_compound_convolve_neon.h new file mode 100644 index 0000000..c9344f3 --- /dev/null +++ b/av1/common/arm/highbd_compound_convolve_neon.h
@@ -0,0 +1,293 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + +static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + ConvolveParams *conv_params) { + const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); + const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = 
dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + 
vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_12_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params) { + const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = 
vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), + vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, 
ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), + vqrshrun_n_s32(d1, ROUND_SHIFT)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +}
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c new file mode 100644 index 0000000..1d6c9b4 --- /dev/null +++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -0,0 +1,1555 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_compound_convolve_neon.h" +#include "av1/common/arm/highbd_convolve_neon.h" +#include "av1/common/arm/highbd_convolve_sve2.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, +}; + +static INLINE uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = 
vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), + vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x1_t offset_vec = + vcreate_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); + + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset_lo); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset_lo); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset_lo); + uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset_lo); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], 
filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), + vqrshrun_n_s32(sum4567, ROUND0_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_x_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x1_t offset_vec = + vcreate_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); + const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); + + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset_lo); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset_lo); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset_lo); + uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset_lo); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 
* dst_stride; + height -= 4; + } while (height != 0); +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); +} + +static INLINE uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4], + int16x8_t filter, + int64x2_t offset, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS + 2), + vqrshrun_n_s32(sum2637, ROUND0_BITS + 2)); + return aom_tbl_u16(res, tbl); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x2_t offset = + vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + 
do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + 
return vqrshrun_n_s32(sum0123, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter, + int64x2_t offset, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS), + vqrshrun_n_s32(sum2637, ROUND0_BITS)); + return aom_tbl_u16(res, tbl); +} + +static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = + vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], 
s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_dist_wtd_convolve_x_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + + if (x_filter_taps == 6) { + av1_highbd_dist_wtd_convolve_x_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, + conv_params, bd); + return; + } + + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int horiz_offset = filter_params_x->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (bd == 12) { + if (conv_params->do_average) { + if (x_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, + im_stride, w, h, x_filter_ptr); + } else { + highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, 
x_filter_ptr); + } + + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params); + + } else { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params); + } + } else { + if (x_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_x_4tap_sve2( + src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); + } else { + highbd_12_dist_wtd_convolve_x_8tap_sve2( + src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); + } + } + } else { + if (conv_params->do_average) { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, + im_stride, w, h, x_filter_ptr, bd); + } else { + highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, x_filter_ptr, bd); + } + + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } + } else { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2( + src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); + } else { + highbd_dist_wtd_convolve_x_8tap_sve2( + src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); + } + } + } +} + +static INLINE uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); +} + +static INLINE uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t 
samples_hi[4], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), + vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); +} + +static INLINE void highbd_12_dist_wtd_convolve_y_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr) { + const int64x2_t offset = + vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = highbd_12_convolve8_4_y(s0123, s4567, y_filter, offset); + uint16x4_t d1 = highbd_12_convolve8_4_y(s1234, s5678, y_filter, offset); + uint16x4_t d2 = highbd_12_convolve8_4_y(s2345, s6789, y_filter, offset); + uint16x4_t d3 = highbd_12_convolve8_4_y(s3456, s789A, y_filter, offset); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. 
+ // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = highbd_12_convolve8_8_y(s0123, s4567, y_filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_y(s1234, s5678, y_filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_y(s2345, s6789, y_filter, offset); + uint16x8_t d3 = highbd_12_convolve8_8_y(s3456, s789A, y_filter, offset); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), + vqrshrun_n_s32(sum4567, 
ROUND0_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_y_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, const int bd) { + const int64x2_t offset = + vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, offset); + uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, offset); + uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, offset); + uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, offset); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, offset); + uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, offset); + uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, offset); + uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, offset); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_dist_wtd_convolve_y_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps != 8) { + av1_highbd_dist_wtd_convolve_y_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, + conv_params, bd); + return; + } + + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + + const int16_t *y_filter_ptr = 
av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (bd == 12) { + if (conv_params->do_average) { + highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr); + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params); + } else { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params); + } + } else { + highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, + dst16_stride, w, h, y_filter_ptr); + } + } else { + if (conv_params->do_average) { + highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, + w, h, y_filter_ptr, bd); + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } + } else { + highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, + w, h, y_filter_ptr, bd); + } + } +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 2)); + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. 
+ + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); + uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)src; + do { + int16x8_t s0[8], s1[8], s2[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], + &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], + &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], + &s2[5], &s2[6], &s2[7]); + + uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + s += 8; + dst += 8; + width -= 8; + } while (width != 0); +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 2)); + const int16x8_t filter = vld1q_s16(x_filter_ptr); + + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. 
+ + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); + uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)src; + do { + int16x8_t s0[8], s1[8], s2[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], + &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], + &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], + &s2[5], &s2[6], &s2[7]); + + uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + s += 8; + dst += 8; + width -= 8; + } while (width != 0); +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr) { + const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 1)); + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); + + store_u16_4x3(dst, dst_stride, d0, d1, d2); + + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0[4], s1[4], s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + + uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + + s += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, const int bd) { + const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 1)); + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know + // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at + // a time and then process the last 3 rows separately. + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); + uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); + uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); + uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); + + store_u16_4x3(dst, dst_stride, d0, d1, d2); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); + uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 4); + + // Process final 3 rows. 
+ const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0[4], s1[4], s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + + uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); + uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); + uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); + + store_u16_8x3(dst, dst_stride, d0, d1, d2); + + s += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + return vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + int64x2_t offset) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), 
vmovn_s64(sum67)); + + return vcombine_u16(vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum4567, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int64x2_t offset_s64 = vdupq_n_s64(offset); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + int16_t *s = (int16_t *)src; + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. 
+ transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0123, s4567, y_filter, offset_s64); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1234, s5678, y_filter, offset_s64); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2345, s6789, y_filter, offset_s64); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3456, s789A, y_filter, offset_s64); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0123, s4567, y_filter, offset_s64); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1234, s5678, y_filter, offset_s64); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2345, s6789, y_filter, offset_s64); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3456, s789A, y_filter, offset_s64); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); + sum = vmlal_lane_s16(sum, s1, filter, 1); + sum = vmlal_lane_s16(sum, s2, filter, 2); + sum = vmlal_lane_s16(sum, s3, filter, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve4_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 
2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_4tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = + highbd_convolve4_4_2d_v(s0, s1, s2, s3, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve4_4_2d_v(s1, s2, s3, s4, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve4_4_2d_v(s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve4_4_2d_v(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = + highbd_convolve4_8_2d_v(s0, s1, s2, s3, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve4_8_2d_v(s1, s2, s3, s4, y_filter, offset_vec); + uint16x8_t 
d2 = + highbd_convolve4_8_2d_v(s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve4_8_2d_v(s3, s4, s5, s6, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_highbd_dist_wtd_convolve_2d_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, + im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; + + if (x_filter_taps == 6 || y_filter_taps == 6) { + av1_highbd_dist_wtd_convolve_2d_neon( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset_conv_y = (1 << y_offset_bits); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (bd == 12) { + if (x_filter_taps <= 4) { + highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); + } else { + highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); + } + } else { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd); + } else { + highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd); + } + } + + if (conv_params->do_average) { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_2d_vert_4tap_neon(im_block, im_stride, im_block2, + im_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } else { + highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2, + im_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, + w, h, conv_params); + + } else { + 
highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, + h, conv_params, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params); + + } else { + highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, bd); + } + } + } else { + if (y_filter_taps <= 4) { + highbd_dist_wtd_convolve_2d_vert_4tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } else { + highbd_dist_wtd_convolve_2d_vert_8tap_sve2( + im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } + } +}
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c index 51da025..4f1c25d 100644 --- a/av1/common/arm/highbd_convolve_horiz_rs_neon.c +++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -142,9 +142,9 @@ d0 = vmin_u16(d0, max); if (w == 2) { - store_u16_2x1(d + 0 * dst_stride, d0, 0); + store_u16_2x1(d, d0); } else { - vst1_u16(d + 0 * dst_stride, d0); + vst1_u16(d, d0); } src_ptr += src_stride;
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c index 3f5ff9e..3a3e33f 100644 --- a/av1/common/arm/highbd_convolve_neon.c +++ b/av1/common/arm/highbd_convolve_neon.c
@@ -1927,7 +1927,7 @@ uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } @@ -1978,7 +1978,7 @@ uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } @@ -2086,7 +2086,7 @@ d0 = vhadd_u16(d0, vget_low_u16(vert_offset)); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); }
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c index eee5a1c..702c651 100644 --- a/av1/common/arm/highbd_convolve_scale_neon.c +++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -51,7 +51,7 @@ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr, d0_u16, 0); + store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } @@ -123,7 +123,7 @@ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr, d0_u16, 0); + store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } @@ -260,9 +260,9 @@ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); if (w == 2) { - store_u16_2x1(d + 0 * dst_stride, d0, 0); + store_u16_2x1(d, d0); } else { - vst1_u16(d + 0 * dst_stride, d0); + vst1_u16(d, d0); } src_ptr += src_stride; @@ -398,7 +398,7 @@ offset_s32, vdupq_n_s32(0)); if (w == 2) { - store_u16_2x1(d, d0, 0); + store_u16_2x1(d, d0); } else { vst1_u16(d, d0); } @@ -458,7 +458,7 @@ uint16x4_t d = vqmovun_s32(d0); d = vmin_u16(d, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr + y * dst_stride, d, 0); + store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); }
diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c new file mode 100644 index 0000000..6ce9f36 --- /dev/null +++ b/av1/common/arm/highbd_convolve_sve2.c
@@ -0,0 +1,1718 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_sve2.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, +}; + +static INLINE uint16x4_t convolve12_4_x( + int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, + const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) { + int16x8_t permuted_samples[6]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t 
sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t filter_0_7, + int16x8_t filter_4_11, int64x2_t offset, + uint16x8x4_t permute_tbl, + uint16x8_t max) { + int16x8_t permuted_samples[8]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); + permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); + 
sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd) { + // This shim allows to do only one rounding shift instead of two. + const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( + vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); + permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); + + uint16x8_t correction1 = vreinterpretq_u16_u64( + vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), + vdup_n_u64(svcnth() * 0x0001000100010000ULL))); + permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); + load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); + + uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; + load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); + load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); + load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); + + uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d3 = convolve12_8_x(s9, s10, s11, 
y_filter_0_7, + y_filter_4_11, offset, permute_tbl, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, uint16x8_t max) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows to do only one rounding shift instead of two. 
+ const int64_t offset = 1 << (conv_params->round_0 - 1); + const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0)); + + const int16x8_t filter = vld1q_s16(y_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max); + uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max); + uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max); + uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl, + uint16x4_t max) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], 
int16x8_t filter, + int64x2_t offset, uint16x8_t tbl, + uint16x8_t max) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS), + vqrshrun_n_s32(sum2637, FILTER_BITS)); + res = aom_tbl_u16(res, tbl); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, int bd) { + // This shim allows to do only one rounding shift instead of two. + const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max); + uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max); + uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max); + uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t 
*)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max); + uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max); + uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max); + uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + return; + } + + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + + if (x_filter_taps == 6) { + av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, + bd); + return; + } + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (x_filter_taps == 12) { + highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + if (x_filter_taps == 8) { + highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + 
highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); +} + +static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t s2[2], + int16x8_t filter_0_7, + int16x8_t filter_4_11, + uint16x4_t max) { + int64x2_t sum[2]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); + sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1); + sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); + sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1); + sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE void highbd_convolve_y_sr_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, int bd) { + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + do { + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + int h = height; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_4x4(s4, s5, s6, s7, s4567); + transpose_concat_4x4(s5, s6, s7, s8, s5678); + transpose_concat_4x4(s6, s7, s8, s9, s6789); + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_4x4(sB, sC, sD, sE, sBCDE); + + // Use the above transpose and reuse data from the previous loop to get + // the rest. 
+ aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); + + uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7, + y_filter_4_11, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 4; + dst += 4; + width -= 4; + } while (width != 0); +} + +static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + uint16x8_t max) { + 
int64x2_t sum01 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + int bd) { + assert(width >= 4 && height >= 4); + + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); + int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 
0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + int bd) { + assert(width >= 4 && height >= 4); + + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. 
+ s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. 
+ s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps == 6) { + av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + + const int vert_offset = filter_params_y->taps / 2 - 1; + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (y_filter_taps > 8) { + highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + + if (y_filter_taps == 4) { + highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_ptr, bd); + return; + } + + highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); +} + +static INLINE uint16x4_t convolve12_4_2d_h( + int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, + const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) { + int16x8_t permuted_samples[6]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + + int64x2_t sum01 = + 
aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vqrshlq_s32(sum0123, shift); + return vqmovun_s32(sum0123); +} + /* 12-tap horizontal filter returning 8 outputs from the sample vectors s0, s1 and s2. Eight permuted vectors are built with permute_tbl and four accumulator pairs (sum01, sum23, sum45, sum67) each add three aom_svdot_lane_s16 products (filter_0_7 lanes 0 and 1, filter_4_11 lane 1) on top of offset; the sums are then narrowed, shifted by shift with vqrshlq_s32 and saturated to unsigned 16 bits. */ +static INLINE uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t filter_0_7, + int16x8_t filter_4_11, + int64x2_t offset, int32x4_t shift, + uint16x8x4_t permute_tbl) { + int16x8_t permuted_samples[8]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); + permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); + sum45 = 
aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + /* Horizontal pass for 12-tap filters. Filters src into dst four rows at a time: width == 4 uses convolve12_4_2d_h, any other width is walked in 8-column steps with convolve12_8_2d_h (the column loop exits on w != 0, so width must be a multiple of 8). x_offset is the initial accumulator value and the shift amount is -conv_params->round_0, i.e. a rounding right shift. The row loops exit on height > 0, so a height that is not a multiple of 4 is rounded up and dst must have room for the extra rows. NOTE(review): y_filter_ptr carries the horizontal kernel here - the caller passes x_filter_ptr. */ +static INLINE void highbd_convolve_2d_sr_horiz_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( + vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); + permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); + + uint16x8_t correction1 = vreinterpretq_u16_u64( + vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), + vdup_n_u64(svcnth() * 0x0001000100010000ULL))); + permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); + + if (width == 4) { + const int16_t *s = (const int16_t *)src; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); + load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); + + uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + s += 4 * src_stride; + height -= 4; + } while (height > 0); + } else { + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; + load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); + load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); + load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); + + uint16x8_t d0 = + convolve12_8_2d_h(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d1 = + convolve12_8_2d_h(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d2 = + convolve12_8_2d_h(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d3 = + convolve12_8_2d_h(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + + 
store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + /* 8-tap horizontal filter returning 8 outputs, one per entry of s0[]. Each aom_sdotq_s16 result holds two 64-bit partial sums that vpaddq_s64 then adds together, so both lanes of offset end up in every output; that is why the caller passes offset_lo, whose upper lane is zero. The sums are narrowed to 32 bits, shifted by shift with vqrshlq_s32 and saturated to unsigned 16 bits. */ +static INLINE uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, int32x4_t shift) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + /* Horizontal pass for 8-tap filters: each inner iteration loads eight sample vectors per row at consecutive element offsets for four rows and stores a 4-row by 8-column tile. Only the low 64-bit lane of the offset vector keeps x_offset (offset_lo) because convolve8_8_2d_h sums both lanes. Rows run while height > 0, so a height that is not a multiple of 4 is rounded up. NOTE(review): the column loop exits on w != 0, so width must be a multiple of 8 - presumably narrower blocks never reach this kernel; confirm. y_filter_ptr is the horizontal kernel (the caller passes x_filter_ptr). */ +static INLINE void highbd_convolve_2d_sr_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0)); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x8_t filter = vld1q_s16(y_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], 
&s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift); + uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift); + uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift); + uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); +} + /* 4-tap horizontal filter returning 4 outputs from a single sample vector: s0 is permuted twice with permute_tbl, each permuted vector gets one aom_svdot_lane_s16 (filter lane 0) on top of offset, and the sums are narrowed, shifted by shift with vqrshlq_s32 and saturated to unsigned 16 bits. */ +static INLINE uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vqrshlq_s32(sum0123, shift); + return vqmovun_s32(sum0123); +} + /* 4-tap horizontal filter returning 8 outputs. Only s0[0..3] are read even though the parameter is declared with 8 entries. Going by the accumulator names (sum04, sum15, ...) each dot product yields outputs i and i + 4, so the packed vector is out of order and is reordered with tbl before returning (the caller passes kDeinterleaveTbl). */ +static INLINE uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + 
return aom_tbl_u16(res, tbl); +} + /* Horizontal pass for 4-tap filters. The kernel is the four coefficients x_filter_ptr[2..5], zero-padded to eight lanes. width == 4 uses convolve4_4_2d_h with the first two rows of kDotProdTbl; every other width is walked in 8-column steps with convolve4_8_2d_h and kDeinterleaveTbl (the column loop exits on w != 0, so width must be a multiple of 8). Rows are processed four at a time while height > 0, so a height that is not a multiple of 4 is rounded up. x_offset seeds the accumulators and the shift amount is -conv_params->round_0. */ +static INLINE void highbd_convolve_2d_sr_horiz_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + const int16_t *s = (const int16_t *)(src); + + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); + uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); + uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); + uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { /* NOTE(review): eight vectors are loaded per row but convolve4_8_2d_h only reads entries 0..3 - the loads into [4..7] look unnecessary; confirm before trimming. */ + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx); + uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx); + uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx); + 
uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + /* 12-tap vertical filter for 4 columns. s0, s1 and s2 are the transposed sample blocks paired with filter_0_7 lane 0, filter_0_7 lane 1 and filter_4_11 lane 1 respectively. The sum starts from offset, is narrowed to 32 bits, shifted with vshlq_s32 (plain, non-rounding shift), saturated to unsigned 16 bits and clamped to max. */ +static INLINE uint16x4_t highbd_convolve12_4_2d_v( + int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, + int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + + return vmin_u16(res, max); +} + /* Vertical pass for 12-tap filters, processed in 4-column strips. Eleven rows are loaded up front; every inner iteration loads four more rows and stores four output rows. Both loops exit on != 0, so width and height must be multiples of 4. y_offset seeds the accumulators, the shift amount is -conv_params->round_1 and outputs are clamped to (1 << bd) - 1. */ +static INLINE void highbd_convolve_2d_sr_vert_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd, const int y_offset) { + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + do { + int16_t *s = (int16_t *)src; + uint16_t *d = (uint16_t *)dst; + int h = height; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_4x4(s4, s5, s6, s7, s4567); + transpose_concat_4x4(s5, s6, s7, s8, s5678); + transpose_concat_4x4(s6, s7, s8, s9, s6789); + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_4x4(sB, sC, sD, sE, sBCDE); + + // Use the above transpose and reuse data from the previous loop to get + // the rest. 
+ aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); + + uint16x4_t d0 = highbd_convolve12_4_2d_v( + s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d1 = highbd_convolve12_4_2d_v( + s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d2 = highbd_convolve12_4_2d_v( + s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d3 = highbd_convolve12_4_2d_v( + s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 4; + dst += 4; + width -= 4; + } while (width != 0); +} + /* 8-tap vertical filter for 4 columns: samples_lo is paired with filter lane 0 and samples_hi with filter lane 1, both accumulated on top of offset. The sums are narrowed to 32 bits, shifted with vshlq_s32 (non-rounding), saturated to unsigned 16 bits and clamped to max. */ +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, + int32x4_t shift, int64x2_t offset, uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + return vmin_u16(res, max); +} + /* 8-tap vertical filter for 8 columns: same lo/hi pairing as highbd_convolve8_4_2d_v, applied to four vector pairs, returning eight outputs clamped to max. */ +static INLINE uint16x8_t 
highbd_convolve8_8_2d_v( + int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, + int32x4_t shift, int64x2_t offset, uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return vminq_u16(res, max); +} + /* Vertical pass for 8-tap filters. width == 4 handles a single 4-column strip; any other width is walked in 8-column strips until width reaches 0. Seven rows are preloaded and transposed; each iteration loads four new rows, transposes them and merges them with the previous block through merge_block_tbl before filtering. The row loops exit on != 0, so height must be a multiple of 4. Asserts width >= 4 and height >= 4. y_offset seeds the accumulators and the shift amount is -conv_params->round_1. */ +static void highbd_convolve_2d_sr_vert_8tap_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, const int y_offset) { + assert(width >= 4 && height >= 4); + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + /* 4-tap vertical filter for 4 columns: one aom_svdot_lane_s16 (filter lane 0) per transposed sample vector on top of offset, then narrow to 32 bits, shift with vshlq_s32 (non-rounding), saturate to unsigned 16 bits and clamp to max. */ +static INLINE uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], + int16x8_t filter, + int32x4_t shift, + int64x2_t offset, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + return vmin_u16(res, max); +} + /* 4-tap vertical filter for 8 columns: the same single-product pattern as highbd_convolve4_4_2d_v applied to four transposed sample vectors, returning eight outputs clamped to max. */ +static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], + int16x8_t filter, + int32x4_t shift, + int64x2_t offset, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); 
+ int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return vminq_u16(res, max); +} + /* Vertical pass for 4-tap filters. The kernel is filter_y[2..5] zero-padded to eight lanes. Three rows are preloaded and each iteration loads four more, rebuilding all four transposed blocks from the raw rows (no merge table is used here) and carrying the last three rows over. width == 4 is a single 4-column strip; any other width is walked in 8-column strips. The row loops exit on != 0, so height must be a multiple of 4. Asserts width >= 4 and height >= 4; outputs are clamped to (1 << bd) - 1. */ +static void highbd_convolve_2d_sr_vert_4tap_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, const int y_offset) { + assert(width >= 4 && height >= 4); + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)(src); + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = + highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max); + uint16x4_t d1 = + highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max); + uint16x4_t d2 = + highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max); + uint16x4_t d3 = + highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)(src); + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = + highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max); + uint16x8_t d1 = + highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max); + uint16x8_t d2 = + highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max); + uint16x8_t d3 = + highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. 
+ s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + /* Entry point for the 2D (horizontal then vertical) high-bitdepth convolution. w == 2 or h == 2 is forwarded to av1_highbd_convolve_2d_sr_c and any 6-tap filter (per get_filter_tap) to av1_highbd_convolve_2d_sr_neon. Otherwise a horizontal pass writes h + clamped_y_taps - 1 rows into the stack buffer im_block (stride MAX_SB_SIZE) and a vertical pass filters im_block into dst; tap counts below 4 are clamped to 4 when computing the source offsets and intermediate height. NOTE(review): the 12-tap branch keys on x_filter_taps alone and always pairs it with the 12-tap vertical kernel - presumably both filters are 12-tap together; confirm. */ +void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + return; + } + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (x_filter_taps == 6 || y_filter_taps == 6) { + av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } + + const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; + + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + const int x_offset = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a + // simple shift left instead of a rounding saturating shift left. 
+ const int y_offset = + (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int im_h = h + clamped_y_taps - 1; + + if (x_filter_taps > 8) { + highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + + highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + return; + } + + if (x_filter_taps <= 4) { + highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + } else { + highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + } + + if (y_filter_taps <= 4) { + highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } else { + highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } +}
diff --git a/av1/common/arm/highbd_convolve_sve2.h b/av1/common/arm/highbd_convolve_sve2.h new file mode 100644 index 0000000..05e23de --- /dev/null +++ b/av1/common/arm/highbd_convolve_sve2.h
@@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ + +#include <arm_neon.h> + +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. 
+ 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); + + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); +} + +static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], 
+ uint16x8_t tbl, int16x8_t res[4]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); + res[2] = aom_tbl2_s16(t0[2], t1[2], tbl); + res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint16x8_t tbl, int16x8_t res[2]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_
diff --git a/av1/common/arm/highbd_reconinter_neon.c b/av1/common/arm/highbd_reconinter_neon.c index 573d3c1..da7f6c5 100644 --- a/av1/common/arm/highbd_reconinter_neon.c +++ b/av1/common/arm/highbd_reconinter_neon.c
@@ -113,8 +113,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -205,8 +204,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -298,8 +296,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride;
diff --git a/av1/common/arm/highbd_reconintra_neon.c b/av1/common/arm/highbd_reconintra_neon.c index 170491b..8fd4a99 100644 --- a/av1/common/arm/highbd_reconintra_neon.c +++ b/av1/common/arm/highbd_reconintra_neon.c
@@ -13,6 +13,7 @@ #include <assert.h> #include "aom_dsp/arm/sum_neon.h" +#include "config/av1_rtcd.h" #define MAX_UPSAMPLE_SZ 16
diff --git a/av1/common/arm/highbd_warp_plane_neon.c b/av1/common/arm/highbd_warp_plane_neon.c index 0729df6..51bf142 100644 --- a/av1/common/arm/highbd_warp_plane_neon.c +++ b/av1/common/arm/highbd_warp_plane_neon.c
@@ -21,65 +21,14 @@ #include "av1/common/scale.h" #include "av1/common/warped_motion.h" #include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" -static INLINE int16x8_t load_filters_1(int ofs) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - return vld1q_s16(base + ofs0 * 8); -} - -static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); - const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); - const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); - const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - out[0] = vld1q_s16(base + ofs0 * 8); - out[1] = vld1q_s16(base + ofs1 * 8); - out[2] = vld1q_s16(base + ofs2 * 8); - out[3] = vld1q_s16(base + ofs3 * 8); -} - -static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); - const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); - const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); - const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); - const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); - const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); - const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - out[0] = vld1q_s16(base + ofs0 * 8); - out[1] = vld1q_s16(base + ofs1 * 8); - out[2] = vld1q_s16(base + ofs2 * 8); - out[3] = vld1q_s16(base + ofs3 * 8); - 
out[4] = vld1q_s16(base + ofs4 * 8); - out[5] = vld1q_s16(base + ofs5 * 8); - out[6] = vld1q_s16(base + ofs6 * 8); - out[7] = vld1q_s16(base + ofs7 * 8); -} - -static INLINE int16x8_t warp_affine_horizontal_step_4x1_f4_neon( - int bd, int sx, int alpha, uint16x8x2_t in) { +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); @@ -100,31 +49,12 @@ return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static INLINE int16x8_t warp_affine_horizontal_step_8x1_f8_neon( - int bd, int sx, int alpha, uint16x8x2_t in) { - const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_horiz = bd + FILTER_BITS - 1; - +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); - int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 0); - int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 1); - int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 2); - int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 3); - int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 4); - int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 5); - int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 6); - int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), - vreinterpretq_s16_u16(in.val[1]), 7); - int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); @@ -145,6 +75,9 @@ int32x4_t m0123[] = { m0, m1, m2, m3 }; int32x4_t m4567[] = { m4, m5, m6, m7 }; + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); int32x4_t res1 = horizontal_add_4d_s32x4(m4567); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); @@ -154,78 +87,70 @@ return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } -static INLINE void warp_affine_horizontal_neon(const uint16_t *ref, int width, - int height, int stride, - int p_width, int16_t alpha, - int16_t beta, int iy4, int sx4, - int ix4, int16x8_t tmp[], - int bd) { +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; - if (ix4 <= -7) { - for (int k = 0; k < 15; ++k) { - int iy = clamp(iy4 + k - 7, 0, height - 1); - int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - round0)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } else if (ix4 >= width + 6) { - for (int k = 0; k < 15; ++k) { - int iy = clamp(iy4 + k - 7, 0, height - 1); - int32_t dup_val = - (1 << (bd + FILTER_BITS - round0 - 1)) + - ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } - - for (int k = 0; k < 15; ++k) { - const int iy = clamp(iy4 + k - 7, 0, height - 1); - uint16x8x2_t in = vld1q_u16_x2(ref + iy * stride + ix4 - 7); - - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; - - const uint16_t k0[16] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; - const uint16x8_t indx0 = vld1q_u16(&k0[0]); - const uint16x8_t indx1 = vld1q_u16(&k0[8]); - - if (out_of_boundary_left >= 0) { - uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); - uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); - uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); - in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]); - in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]); - } - if (out_of_boundary_right >= 0) { - uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); - uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); - uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); - in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]); - in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]); - } - - const int sx = sx4 + beta * (k - 3); - if (p_width == 4) { - tmp[k] = warp_affine_horizontal_step_4x1_f4_neon(bd, sx, alpha, in); - } else { - tmp[k] = warp_affine_horizontal_step_8x1_f8_neon(bd, sx, 
alpha, in); - } - } + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) { - const int limit = (1 << bd) - 1; - return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5)); + m5 = vmlal_s16(m5, vget_high_s16(f), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } -static INLINE int32x4_t -warp_affine_vertical_filter_4x1_f1_neon(const int16x8_t *tmp, int sy) { +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); @@ -241,8 +166,8 @@ return m0123; } -static INLINE int32x4x2_t -warp_affine_vertical_filter_8x1_f1_neon(const int16x8_t *tmp, int sy) { +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); @@ -267,8 +192,8 @@ return (int32x4x2_t){ { m0123, m4567 } }; } -static INLINE int32x4_t warp_affine_vertical_filter_4x1_f4_neon( - const int16x8_t *tmp, int sy, int gamma) { +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), @@ -291,8 +216,8 @@ return horizontal_add_4d_s32x4(m0123); } -static INLINE int32x4x2_t warp_affine_vertical_filter_8x1_f8_neon( - const int16x8_t *tmp, int sy, int gamma) { +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma) { int16x8_t s0 = tmp[0]; int16x8_t s1 = tmp[1]; int16x8_t s2 = tmp[2]; @@ -332,165 +257,6 @@ return ret; } -static INLINE void warp_affine_vertical_step_4x1_f4_neon( - uint16_t *pred, int 
p_stride, int bd, uint16_t *dst, int dst_stride, - bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, - int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { - int32x4_t sum0 = - gamma == 0 ? warp_affine_vertical_filter_4x1_f1_neon(tmp, sy) - : warp_affine_vertical_filter_4x1_f4_neon(tmp, sy, gamma); - - const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; - - sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); - - uint16_t *dst16 = &pred[i * p_stride + j]; - - if (!is_compound) { - const int reduce_bits_vert = 2 * FILTER_BITS - round0; - sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); - - const int res_sub_const = (1 << (bd - 1)) + (1 << bd); - sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); - uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); - vst1_u16(dst16, res0); - return; - } - - sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); - - uint16_t *p = &dst[i * dst_stride + j]; - - if (!do_average) { - vst1_u16(p, vqmovun_s32(sum0)); - return; - } - - uint16x4_t p0 = vld1_u16(p); - int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); - if (use_dist_wtd_comp_avg) { - p_vec0 = vmulq_n_s32(p_vec0, fwd); - p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); - p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); - } else { - p_vec0 = vhaddq_s32(p_vec0, sum0); - } - - const int offset_bits = bd + 2 * FILTER_BITS - round0; - const int round1 = COMPOUND_ROUND1_BITS; - const int res_sub_const = - (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); - const int round_bits = 2 * FILTER_BITS - round0 - round1; - - p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); - p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); - uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); - vst1_u16(dst16, res0); -} - -static INLINE void warp_affine_vertical_step_8x1_f8_neon( - uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, - bool 
is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, - int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { - int32x4x2_t sums = - gamma == 0 ? warp_affine_vertical_filter_8x1_f1_neon(tmp, sy) - : warp_affine_vertical_filter_8x1_f8_neon(tmp, sy, gamma); - int32x4_t sum0 = sums.val[0]; - int32x4_t sum1 = sums.val[1]; - - const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; - - sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); - sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); - - uint16_t *dst16 = &pred[i * p_stride + j]; - - if (!is_compound) { - const int reduce_bits_vert = 2 * FILTER_BITS - round0; - sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); - sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); - - const int res_sub_const = (1 << (bd - 1)) + (1 << bd); - sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); - sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); - uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); - uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); - vst1_u16(dst16, res0); - vst1_u16(dst16 + 4, res1); - return; - } - - sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); - sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); - - uint16_t *p = &dst[i * dst_stride + j]; - - if (!do_average) { - vst1_u16(p, vqmovun_s32(sum0)); - vst1_u16(p + 4, vqmovun_s32(sum1)); - return; - } - - uint16x8_t p0 = vld1q_u16(p); - int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); - int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); - if (use_dist_wtd_comp_avg) { - p_vec0 = vmulq_n_s32(p_vec0, fwd); - p_vec1 = vmulq_n_s32(p_vec1, fwd); - p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); - p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); - p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); - p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); - } else { - p_vec0 = vhaddq_s32(p_vec0, sum0); - p_vec1 = 
vhaddq_s32(p_vec1, sum1); - } - - const int offset_bits = bd + 2 * FILTER_BITS - round0; - const int round1 = COMPOUND_ROUND1_BITS; - const int res_sub_const = - (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); - const int round_bits = 2 * FILTER_BITS - round0 - round1; - - p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); - p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); - - p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); - p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); - uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); - uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); - vst1_u16(dst16, res0); - vst1_u16(dst16 + 4, res1); -} - -static INLINE void warp_affine_vertical_neon( - uint16_t *pred, int p_width, int p_height, int p_stride, int bd, - uint16_t *dst, int dst_stride, bool is_compound, bool do_average, - bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, - const int16x8_t *tmp, int i, int sy4, int j) { - int limit_height = p_height > 4 ? 
8 : 4; - - if (p_width > 4) { - // p_width == 8 - for (int k = 0; k < limit_height; ++k) { - int sy = sy4 + delta * k; - warp_affine_vertical_step_8x1_f8_neon( - pred, p_stride, bd, dst, dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); - } - } else { - // p_width == 4 - for (int k = 0; k < limit_height; ++k) { - int sy = sy4 + delta * k; - warp_affine_vertical_step_4x1_f4_neon( - pred, p_stride, bd, dst, dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); - } - } -} - void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, @@ -498,63 +264,8 @@ int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { - uint16_t *const dst = conv_params->dst; - const int dst_stride = conv_params->dst_stride; - const bool is_compound = conv_params->is_compound; - const bool do_average = conv_params->do_average; - const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - const int fwd = conv_params->fwd_offset; - const int bwd = conv_params->bck_offset; - - assert(IMPLIES(is_compound, dst != NULL)); - - for (int i = 0; i < p_height; i += 8) { - for (int j = 0; j < p_width; j += 8) { - // Calculate the center of this 8x8 block, - // project to luma coordinates (if in a subsampled chroma plane), - // apply the affine transformation, - // then convert back to the original coordinates (if necessary) - const int32_t src_x = (j + 4 + p_col) << subsampling_x; - const int32_t src_y = (i + 4 + p_row) << subsampling_y; - const int64_t dst_x = - (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; - const int64_t dst_y = - (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; - const int64_t x4 = dst_x >> subsampling_x; - const int64_t y4 = dst_y >> subsampling_y; - - const int32_t 
ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - sx4 += alpha * (-4) + beta * (-4); - sy4 += gamma * (-4) + delta * (-4); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Each horizontal filter result is formed by the sum of up to eight - // multiplications by filter values and then a shift. Although both the - // inputs and filters are loaded as int16, the input data is at most bd - // bits and the filters are at most 8 bits each. Additionally since we - // know all possible filter values we know that the sum of absolute - // filter values will fit in at most 9 bits. With this in mind we can - // conclude that the sum of each filter application will fit in bd + 9 - // bits. The shift following the summation is ROUND0_BITS (which is 3), - // +2 for 12-bit, which gives us a final storage of: - // bd == 8: ( 8 + 9) - 3 => 14 bits - // bd == 10: (10 + 9) - 3 => 16 bits - // bd == 12: (12 + 9) - 5 => 16 bits - // So it is safe to use int16x8_t as the intermediate storage type here. - int16x8_t tmp[15]; - - warp_affine_horizontal_neon(ref, width, height, stride, p_width, alpha, - beta, iy4, sx4, ix4, tmp, bd); - warp_affine_vertical_neon(pred, p_width, p_height, p_stride, bd, dst, - dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, - tmp, i, sy4, j); - } - } + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); }
diff --git a/av1/common/arm/highbd_warp_plane_neon.h b/av1/common/arm/highbd_warp_plane_neon.h new file mode 100644 index 0000000..2ec45d1 --- /dev/null +++ b/av1/common/arm/highbd_warp_plane_neon.h
@@ -0,0 +1,503 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ + +#include <arm_neon.h> +#include <assert.h> +#include <stdbool.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha); + +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha); + +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx); + +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx); + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy); + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy); + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int 
gamma); + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma); + +static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + return vld1q_s16(base + ofs0 * 8); +} + +static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs, + int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); +} + +static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs, + int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); + const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); + const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); + const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); + out[4] = 
vld1q_s16(base + ofs4 * 8); + out[5] = vld1q_s16(base + ofs5 * 8); + out[6] = vld1q_s16(base + ofs6 * 8); + out[7] = vld1q_s16(base + ofs7 * 8); +} + +static AOM_FORCE_INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, + int bd) { + const int limit = (1 << bd) - 1; + return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +} + +static AOM_FORCE_INLINE uint16x8x2_t clamp_horizontal( + uint16x8x2_t src_1, int out_of_boundary_left, int out_of_boundary_right, + const uint16_t *ref, int iy, int stride, int width, const uint16x8_t indx0, + const uint16x8_t indx1) { + if (out_of_boundary_left >= 0) { + uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); + uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); + uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); + } + if (out_of_boundary_right >= 0) { + uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); + uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); + uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); + } + return src_1; +} + +static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, + int width, int height, + int stride, int p_width, + int16_t alpha, int16_t beta, + int iy4, int sx4, int ix4, + int16x8_t tmp[], int bd) { + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + + if (ix4 <= -7) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = + (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + + static const uint16_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint16x8_t indx0 = vld1q_u16(kIotaArr); + const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT_4X1(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ + out_of_boundary_right, ref, iy, stride, \ + width, indx0, indx1); \ + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 0); \ + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 1); \ + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 2); \ + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 3); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + const uint16_t *src = ref + iy * stride + ix4; \ + int16x8_t rv0 = 
vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ + int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ + int16x8_t rv2 = vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ + int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ + } \ + } \ + } while (0) + +#define APPLY_HORIZONTAL_SHIFT_8X1(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ + out_of_boundary_right, ref, iy, stride, \ + width, indx0, indx1); \ + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 0); \ + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 1); \ + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 2); \ + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 3); \ + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 4); \ + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 5); \ + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 6); \ + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ + vreinterpretq_s16_u16(src_1.val[1]), 7); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + const uint16_t *src = ref + iy * stride + ix4; \ + int16x8_t rv0 = vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ + int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ + int16x8_t rv2 = 
vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ + int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ + int16x8_t rv4 = vreinterpretq_s16_u16(vld1q_u16(src - 3)); \ + int16x8_t rv5 = vreinterpretq_s16_u16(vld1q_u16(src - 2)); \ + int16x8_t rv6 = vreinterpretq_s16_u16(vld1q_u16(src - 1)); \ + int16x8_t rv7 = vreinterpretq_s16_u16(vld1q_u16(src - 0)); \ + tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ + } \ + } \ + } while (0) + + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, sx4, + alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, sx4, + alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } + +#undef APPLY_HORIZONTAL_SHIFT_4X1 +#undef APPLY_HORIZONTAL_SHIFT_8X1 +} + +static AOM_FORCE_INLINE void highbd_vertical_filter_4x1_f4( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy) + : vertical_filter_4x1_f4(tmp, sy, gamma); + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + vst1_u16(dst16, res0); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + return; + } + + uint16x4_t p0 = vld1_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + vst1_u16(dst16, res0); +} + +static AOM_FORCE_INLINE void highbd_vertical_filter_8x1_f8( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy) + : vertical_filter_8x1_f8(tmp, sy, gamma); + int32x4_t sum0 = sums.val[0]; + int32x4_t sum1 = sums.val[1]; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + vst1_u16(p + 4, vqmovun_s32(sum1)); + return; + } + + uint16x8_t p0 = vld1q_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); + int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec1 = vmulq_n_s32(p_vec1, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + p_vec1 = vhaddq_s32(p_vec1, sum1); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec1 = vsubq_s32(p_vec1, 
vdupq_n_s32(res_sub_const)); + + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); +} + +static AOM_FORCE_INLINE void warp_affine_vertical( + uint16_t *pred, int p_width, int p_height, int p_stride, int bd, + uint16_t *dst, int dst_stride, bool is_compound, bool do_average, + bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, + const int16x8_t *tmp, int i, int sy4, int j) { + int limit_height = p_height > 4 ? 8 : 4; + + if (p_width > 4) { + // p_width == 8 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_8x1_f8( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } else { + // p_width == 4 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_4x1_f4( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } +} + +static AOM_FORCE_INLINE void highbd_warp_affine_common( + const int32_t *mat, const uint16_t *ref, int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const bool is_compound = conv_params->is_compound; + const bool do_average = conv_params->do_average; + const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fwd = conv_params->fwd_offset; + const int bwd = conv_params->bck_offset; + + assert(IMPLIES(is_compound, dst != NULL)); + + for (int 
i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4 + p_col) << subsampling_x; + const int32_t src_y = (i + 4 + p_row) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Each horizontal filter result is formed by the sum of up to eight + // multiplications by filter values and then a shift. Although both the + // inputs and filters are loaded as int16, the input data is at most bd + // bits and the filters are at most 8 bits each. Additionally since we + // know all possible filter values we know that the sum of absolute + // filter values will fit in at most 9 bits. With this in mind we can + // conclude that the sum of each filter application will fit in bd + 9 + // bits. The shift following the summation is ROUND0_BITS (which is 3), + // +2 for 12-bit, which gives us a final storage of: + // bd == 8: ( 8 + 9) - 3 => 14 bits + // bd == 10: (10 + 9) - 3 => 16 bits + // bd == 12: (12 + 9) - 5 => 16 bits + // So it is safe to use int16x8_t as the intermediate storage type here. 
+ int16x8_t tmp[15]; + + warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta, + iy4, sx4, ix4, tmp, bd); + warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst, + dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp, + i, sy4, j); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_
diff --git a/av1/common/arm/highbd_warp_plane_sve.c b/av1/common/arm/highbd_warp_plane_sve.c new file mode 100644 index 0000000..c2e1e99 --- /dev/null +++ b/av1/common/arm/highbd_warp_plane_sve.c
@@ -0,0 +1,247 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdbool.h> +#include <arm_neon_sve_bridge.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx, int alpha) { + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, + int16x8_t rv3, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( + int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, + int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + return m0123; +} + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. 
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); + return (int32x4x2_t){ { m0123, m4567 } }; +} + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), + vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), + vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); +} + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma) { + 
int16x8_t s0 = tmp[0]; + int16x8_t s1 = tmp[1]; + int16x8_t s2 = tmp[2]; + int16x8_t s3 = tmp[3]; + int16x8_t s4 = tmp[4]; + int16x8_t s5 = tmp[5]; + int16x8_t s6 = tmp[6]; + int16x8_t s7 = tmp[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + int32x4x2_t ret; + ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + return ret; +} + +void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +}
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c index cf488a9..d31c4a9 100644 --- a/av1/common/arm/reconintra_neon.c +++ b/av1/common/arm/reconintra_neon.c
@@ -13,146 +13,200 @@ #include <assert.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #define MAX_UPSAMPLE_SZ 16 -DECLARE_ALIGNED(16, const int8_t, - av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = { +// These kernels are a transposed version of those defined in reconintra.c, +// with the absolute value of the negatives taken in the top row. +DECLARE_ALIGNED(16, const uint8_t, + av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = { + // clang-format off { - { -6, 0, 0, 0, -5, 10, 0, 0 }, - { 10, 0, 12, 0, 2, 0, 9, 0 }, - { -3, 1, 0, 0, -3, 1, 10, 0 }, - { 1, 10, 7, 0, 1, 2, 5, 0 }, - { -4, 0, 0, 12, -3, 6, 0, 9 }, - { 6, 0, 2, 0, 2, 0, 2, 0 }, - { -3, 2, 0, 7, -3, 2, 6, 5 }, - { 2, 6, 2, 0, 1, 2, 3, 0 }, + { 6, 5, 3, 3, 4, 3, 3, 3 }, + { 10, 2, 1, 1, 6, 2, 2, 1 }, + { 0, 10, 1, 1, 0, 6, 2, 2 }, + { 0, 0, 10, 2, 0, 0, 6, 2 }, + { 0, 0, 0, 10, 0, 0, 0, 6 }, + { 12, 9, 7, 5, 2, 2, 2, 3 }, + { 0, 0, 0, 0, 12, 9, 7, 5 } }, { - { -10, 0, 0, 0, -6, 16, 0, 0 }, - { 16, 0, 10, 0, 0, 0, 6, 0 }, - { -4, 0, 0, 0, -2, 0, 16, 0 }, - { 0, 16, 4, 0, 0, 0, 2, 0 }, - { -10, 0, 0, 10, -6, 16, 0, 6 }, - { 16, 0, 0, 0, 0, 0, 0, 0 }, - { -4, 0, 0, 4, -2, 0, 16, 2 }, - { 0, 16, 0, 0, 0, 0, 0, 0 }, + { 10, 6, 4, 2, 10, 6, 4, 2 }, + { 16, 0, 0, 0, 16, 0, 0, 0 }, + { 0, 16, 0, 0, 0, 16, 0, 0 }, + { 0, 0, 16, 0, 0, 0, 16, 0 }, + { 0, 0, 0, 16, 0, 0, 0, 16 }, + { 10, 6, 4, 2, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 10, 6, 4, 2 } }, { - { -8, 0, 0, 0, -8, 8, 0, 0 }, - { 8, 0, 16, 0, 0, 0, 16, 0 }, - { -8, 0, 0, 0, -8, 0, 8, 0 }, - { 0, 8, 16, 0, 0, 0, 16, 0 }, - { -4, 0, 0, 16, -4, 4, 0, 16 }, - { 4, 0, 0, 0, 0, 0, 0, 0 }, - { -4, 0, 0, 16, -4, 0, 4, 16 }, - { 0, 4, 0, 0, 0, 0, 0, 0 }, + { 8, 8, 8, 8, 4, 4, 4, 4 }, + { 8, 0, 0, 0, 4, 0, 0, 0 }, + { 0, 8, 0, 0, 0, 4, 0, 0 }, + { 0, 0, 8, 0, 0, 0, 4, 0 }, + { 0, 0, 0, 8, 0, 0, 0, 4 }, + { 16, 16, 16, 16, 0, 0, 0, 0 }, + { 0, 0, 0, 
0, 16, 16, 16, 16 } }, { - { -2, 0, 0, 0, -1, 8, 0, 0 }, - { 8, 0, 10, 0, 3, 0, 6, 0 }, - { -1, 3, 0, 0, 0, 2, 8, 0 }, - { 2, 8, 4, 0, 1, 3, 2, 0 }, - { -1, 0, 0, 10, -1, 4, 0, 6 }, - { 4, 0, 3, 0, 3, 0, 4, 0 }, - { -1, 3, 0, 4, -1, 2, 4, 3 }, - { 2, 4, 4, 0, 2, 3, 3, 0 }, + { 2, 1, 1, 0, 1, 1, 1, 1 }, + { 8, 3, 2, 1, 4, 3, 2, 2 }, + { 0, 8, 3, 2, 0, 4, 3, 2 }, + { 0, 0, 8, 3, 0, 0, 4, 3 }, + { 0, 0, 0, 8, 0, 0, 0, 4 }, + { 10, 6, 4, 2, 3, 4, 4, 3 }, + { 0, 0, 0, 0, 10, 6, 4, 3 } }, { - { -12, 0, 0, 0, -10, 14, 0, 0 }, - { 14, 0, 14, 0, 0, 0, 12, 0 }, - { -9, 0, 0, 0, -8, 0, 14, 0 }, - { 0, 14, 11, 0, 0, 0, 10, 0 }, - { -10, 0, 0, 14, -9, 12, 0, 12 }, - { 12, 0, 0, 0, 1, 0, 0, 0 }, - { -8, 0, 0, 11, -7, 0, 12, 9 }, - { 0, 12, 1, 0, 0, 1, 1, 0 }, - }, + { 12, 10, 9, 8, 10, 9, 8, 7 }, + { 14, 0, 0, 0, 12, 1, 0, 0 }, + { 0, 14, 0, 0, 0, 12, 0, 0 }, + { 0, 0, 14, 0, 0, 0, 12, 1 }, + { 0, 0, 0, 14, 0, 0, 0, 12 }, + { 14, 12, 11, 10, 0, 0, 1, 1 }, + { 0, 0, 0, 0, 14, 12, 11, 9 } + } + // clang-format on }; #define FILTER_INTRA_SCALE_BITS 4 -#define SHIFT_INTRA_SCALE_BITS 15 - FILTER_INTRA_SCALE_BITS - -#define MASK_LOW \ - 0x604020006040200 // (0 | (2 << 8) | (4 << 16) | (6 << 24)) x 2 -#define MASK_HIGH \ - 0x705030107050301 // (1 | (3 << 8) | (5 << 16) | (7 << 24)) x 2 void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode) { - int r, c; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(width <= 32 && height <= 32); + + const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]); + const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]); + const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]); + const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]); + const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]); + const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]); + const uint8x8_t 
f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]); + uint8_t buffer[33][33]; - const int bw = tx_size_wide[tx_size]; - const int bh = tx_size_high[tx_size]; + // Populate the top row in the scratch buffer with data from above. + memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t)); + // Populate the first column in the scratch buffer with data from the left. + int r = 0; + do { + buffer[r + 1][0] = left[r]; + } while (++r < height); - const int8x16_t f1f0 = vld1q_s8(av1_filter_intra_taps_neon[mode][0]); - const int8x16_t f3f2 = vld1q_s8(av1_filter_intra_taps_neon[mode][2]); - const int8x16_t f5f4 = vld1q_s8(av1_filter_intra_taps_neon[mode][4]); - const int8x16_t f7f6 = vld1q_s8(av1_filter_intra_taps_neon[mode][6]); - const int16x8_t f1f0_lo = vmovl_s8(vget_low_s8(f1f0)); - const int16x8_t f1f0_hi = vmovl_s8(vget_high_s8(f1f0)); - const int16x8_t f3f2_lo = vmovl_s8(vget_low_s8(f3f2)); - const int16x8_t f3f2_hi = vmovl_s8(vget_high_s8(f3f2)); - const int16x8_t f5f4_lo = vmovl_s8(vget_low_s8(f5f4)); - const int16x8_t f5f4_hi = vmovl_s8(vget_high_s8(f5f4)); - const int16x8_t f7f6_lo = vmovl_s8(vget_low_s8(f7f6)); - const int16x8_t f7f6_hi = vmovl_s8(vget_high_s8(f7f6)); - const uint8x8_t vmask_low = vcreate_u8(MASK_LOW); - const uint8x8_t vmask_high = vcreate_u8(MASK_HIGH); + // Computing 4 cols per iteration (instead of 8) for 8x<h> blocks is faster. 
+ if (width <= 8) { + r = 1; + do { + int c = 1; + uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]); - assert(bw <= 32 && bh <= 32); + do { + uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1 = vdup_lane_u8(s1234, 0); + uint8x8_t s2 = vdup_lane_u8(s1234, 1); + uint8x8_t s3 = vdup_lane_u8(s1234, 2); + uint8x8_t s4 = vdup_lane_u8(s1234, 3); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; - memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + uint16x8_t sum = vmull_u8(s1, f1); + // First row of each filter has all negative values so subtract. + sum = vmlsl_u8(sum, s0, f0); + sum = vmlal_u8(sum, s2, f2); + sum = vmlal_u8(sum, s3, f3); + sum = vmlal_u8(sum, s4, f4); + sum = vmlal_u8(sum, s5, f5); + sum = vmlal_u8(sum, s6, f6); - for (r = 1; r < bh + 1; r += 2) { - for (c = 1; c < bw + 1; c += 4) { - DECLARE_ALIGNED(16, uint8_t, p[8]); - memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); - p[5] = buffer[r][c - 1]; - p[6] = buffer[r + 1][c - 1]; - p[7] = 0; + uint8x8_t res = + vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS); - const uint8x8_t p_b = vld1_u8(p); + // Store buffer[r + 0][c] and buffer[r + 1][c]. 
+ store_u8x4_strided_x2(&buffer[r][c], 33, res); - const uint16x8_t p_b_lo = vmovl_u8(vtbl1_u8(p_b, vmask_low)); - const uint16x8_t p_b_hi = vmovl_u8(vtbl1_u8(p_b, vmask_high)); + store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res); - int16x8_t out_01 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f1f0_lo); - out_01 = vmlaq_s16(out_01, vreinterpretq_s16_u16(p_b_hi), f1f0_hi); - int16x8_t out_23 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f3f2_lo); - out_23 = vmlaq_s16(out_23, vreinterpretq_s16_u16(p_b_hi), f3f2_hi); - int16x8_t out_45 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f5f4_lo); - out_45 = vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi); - int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo); - out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi); -#if AOM_ARCH_AARCH64 - const int16x8_t out_0123 = vpaddq_s16(out_01, out_23); - const int16x8_t out_4567 = vpaddq_s16(out_45, out_67); - const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567); -#else - const int16x8_t out_0123 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_01)), - vqmovn_s32(vpaddlq_s16(out_23))); - const int16x8_t out_4567 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_45)), - vqmovn_s32(vpaddlq_s16(out_67))); - const int16x8_t out_01234567 = vcombine_s16( - vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567))); -#endif // AOM_ARCH_AARCH64 - const uint32x2_t out_r = - vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4))); - // Storing - vst1_lane_u32((uint32_t *)&buffer[r][c], out_r, 0); - vst1_lane_u32((uint32_t *)&buffer[r + 1][c], out_r, 1); - } - } + s0 = s4; + s5 = vdup_lane_u8(res, 3); + s6 = vdup_lane_u8(res, 7); + c += 4; + } while (c < width + 1); - for (r = 0; r < bh; ++r) { - memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); - dst += stride; + r += 2; + } while (r < height + 1); + } else { + r = 1; + do { + int c = 1; + uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5_lo = 
vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]); + + do { + uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1_lo = vdup_lane_u8(s1234, 0); + uint8x8_t s2_lo = vdup_lane_u8(s1234, 1); + uint8x8_t s3_lo = vdup_lane_u8(s1234, 2); + uint8x8_t s4_lo = vdup_lane_u8(s1234, 3); + + uint16x8_t sum_lo = vmull_u8(s1_lo, f1); + // First row of each filter has all negative values so subtract. + sum_lo = vmlsl_u8(sum_lo, s0_lo, f0); + sum_lo = vmlal_u8(sum_lo, s2_lo, f2); + sum_lo = vmlal_u8(sum_lo, s3_lo, f3); + sum_lo = vmlal_u8(sum_lo, s4_lo, f4); + sum_lo = vmlal_u8(sum_lo, s5_lo, f5); + sum_lo = vmlal_u8(sum_lo, s6_lo, f6); + + uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo), + FILTER_INTRA_SCALE_BITS); + + uint8x8_t s0_hi = s4_lo; + uint8x8_t s1_hi = vdup_lane_u8(s1234, 4); + uint8x8_t s2_hi = vdup_lane_u8(s1234, 5); + uint8x8_t s3_hi = vdup_lane_u8(s1234, 6); + uint8x8_t s4_hi = vdup_lane_u8(s1234, 7); + uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3); + uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7); + + uint16x8_t sum_hi = vmull_u8(s1_hi, f1); + // First row of each filter has all negative values so subtract. 
+ sum_hi = vmlsl_u8(sum_hi, s0_hi, f0); + sum_hi = vmlal_u8(sum_hi, s2_hi, f2); + sum_hi = vmlal_u8(sum_hi, s3_hi, f3); + sum_hi = vmlal_u8(sum_hi, s4_hi, f4); + sum_hi = vmlal_u8(sum_hi, s5_hi, f5); + sum_hi = vmlal_u8(sum_hi, s6_hi, f6); + + uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi), + FILTER_INTRA_SCALE_BITS); + + uint32x2x2_t res = + vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi)); + + vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0])); + vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1])); + + vst1_u8(dst + (r - 1) * stride + c - 1, + vreinterpret_u8_u32(res.val[0])); + vst1_u8(dst + (r + 0) * stride + c - 1, + vreinterpret_u8_u32(res.val[1])); + + s0_lo = s4_hi; + s5_lo = vdup_lane_u8(res_hi, 3); + s6_lo = vdup_lane_u8(res_hi, 7); + c += 8; + } while (c < width + 1); + + r += 2; + } while (r < height + 1); } }
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c index 076981b..a6d4b62 100644 --- a/av1/common/arm/resize_neon.c +++ b/av1/common/arm/resize_neon.c
@@ -16,6 +16,7 @@ #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/resize.h" #include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, @@ -929,7 +930,7 @@ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(&temp[4 * z], d, 0); + store_u8_4x1(&temp[4 * z], d); } else { int i; for (i = 0; i < 4; ++i) { @@ -942,10 +943,10 @@ // transpose the 4x4 filters values back to dst { const uint8x8x4_t d4 = vld4_u8(temp); - store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0); - store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0); - store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0); - store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0); + store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); + store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); + store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); + store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); } x += 4; } while (x < w); @@ -1040,7 +1041,7 @@ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(dst, d, 0); + store_u8_4x1(dst, d); } else { memcpy(dst, &src_y[3 * src_stride], w); }
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c index 1d3a3cc..08e298f 100644 --- a/av1/common/arm/selfguided_neon.c +++ b/av1/common/arm/selfguided_neon.c
@@ -1124,10 +1124,10 @@ } while (h > 0); } -void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, - int16_t *src, const int src_stride, int32_t *dst, - const int dst_stride, const int width, - const int height) { +static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, + int32_t *dst, const int dst_stride, + const int width, const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp;
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c index 4723154..546aa29 100644 --- a/av1/common/arm/warp_plane_neon.c +++ b/av1/common/arm/warp_plane_neon.c
@@ -11,8 +11,8 @@ #include "warp_plane_neon.h" -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -39,8 +39,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -75,7 +75,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -101,7 +102,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -135,8 +137,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -161,8 +163,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( 
vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -186,9 +189,10 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -223,10 +227,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2];
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h index de5e3bd..eece007 100644 --- a/av1/common/arm/warp_plane_neon.h +++ b/av1/common/arm/warp_plane_neon.h
@@ -24,32 +24,37 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha); -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha); -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx); -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx); -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy); +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy); -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma); +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma); -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy); +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy); -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma); +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma); -static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { +static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, + int stride) { out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * 
stride) >> WARPEDDIFF_PREC_BITS))); out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> @@ -60,7 +65,8 @@ WARPEDDIFF_PREC_BITS))); } -static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { +static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset, + int stride) { out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> WARPEDDIFF_PREC_BITS))); out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> @@ -79,14 +85,14 @@ WARPEDDIFF_PREC_BITS))); } -static INLINE int clamp_iy(int iy, int height) { +static AOM_FORCE_INLINE int clamp_iy(int iy, int height) { return clamp(iy, 0, height - 1); } -static INLINE void warp_affine_horizontal( +static AOM_FORCE_INLINE void warp_affine_horizontal( const uint8_t *ref, int width, int height, int stride, int p_width, int p_height, int16_t alpha, int16_t beta, const int64_t x4, - const int64_t y4, const int i, int16x8_t tmp[], const uint8x16_t indx_vec) { + const int64_t y4, const int i, int16x8_t tmp[]) { const int bd = 8; const int reduce_bits_horiz = ROUND0_BITS; const int height_limit = AOMMIN(8, p_height - i) + 7; @@ -119,92 +125,83 @@ return; } - uint8x16_t in[15]; - if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; + static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint8x16_t indx = vld1q_u8(kIotaArr); - for (int k = 0; k < height_limit; ++k) { - const int iy = clamp_iy(iy4 + k - 7, height); - const uint8_t *src = ref + iy * stride + ix4 - 7; - uint8x16_t src_1 = vld1q_u8(src); + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; - if (out_of_boundary_left >= 0) { - int limit = out_of_boundary_left + 1; - uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); - uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); - uint8x16_t 
mask_val = vcleq_u8(indx_vec, cmp_vec); - src_1 = vbslq_u8(mask_val, vec_dup, src_1); - } - if (out_of_boundary_right >= 0) { - int limit = 15 - (out_of_boundary_right + 1); - uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); - uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); - uint8x16_t mask_val = vcgeq_u8(indx_vec, cmp_vec); - src_1 = vbslq_u8(mask_val, vec_dup, src_1); - } - in[k] = src_1; - } - } else { - for (int k = 0; k < height_limit; ++k) { - const int iy = clamp_iy(iy4 + k - 7, height); - const uint8_t *src = ref + iy * stride + ix4 - 7; - in[k] = vld1q_u8(src); - } - } +#define APPLY_HORIZONTAL_SHIFT(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + \ + if (out_of_boundary_left >= 0) { \ + int limit = out_of_boundary_left + 1; \ + uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + if (out_of_boundary_right >= 0) { \ + int limit = 15 - (out_of_boundary_right + 1); \ + uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) if (p_width == 4) { if (beta == 0) { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_4x1_f1(in[k], sx4); - } + 
APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, sx4); } else { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_4x1_f4(in[k], sx4, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); } } else { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_4x1_f1(in[k], sx); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, + (sx4 + beta * (k - 3))); } else { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_4x1_f4(in[k], sx, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), + alpha); } } } else { if (beta == 0) { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_8x1_f1(in[k], sx4); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, sx4); } else { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_8x1_f8(in[k], sx4, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); } } else { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_8x1_f1(in[k], sx); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, + (sx4 + beta * (k - 3))); } else { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_8x1_f8(in[k], sx, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), + alpha); } } } } -static INLINE void warp_affine_vertical( +static AOM_FORCE_INLINE void warp_affine_vertical( uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound, uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j, @@ -332,7 +329,7 @@ } } -static INLINE void av1_warp_affine_common( +static AOM_FORCE_INLINE void 
av1_warp_affine_common( const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, @@ -346,10 +343,6 @@ const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - static const uint8_t k0To15[16] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; - const uint8x16_t indx_vec = vld1q_u8(k0To15); - assert(IMPLIES(is_compound, dst != NULL)); assert(IMPLIES(do_average, is_compound)); @@ -367,7 +360,7 @@ int16x8_t tmp[15]; warp_affine_horizontal(ref, width, height, stride, p_width, p_height, - alpha, beta, x4, y4, i, tmp, indx_vec); + alpha, beta, x4, y4, i, tmp); warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, dst_stride, do_average, use_dist_wtd_comp_avg, gamma, delta, y4, i, j, tmp, w0, w1);
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c index 39e3ad9..22a1be1 100644 --- a/av1/common/arm/warp_plane_neon_i8mm.c +++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -17,8 +17,8 @@ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -45,8 +45,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -83,7 +83,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -112,7 +113,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -149,8 +151,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -175,8 +177,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, 
s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -200,9 +203,10 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -237,10 +241,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2];
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c index 2a48c5e..c70b066 100644 --- a/av1/common/arm/warp_plane_sve.c +++ b/av1/common/arm/warp_plane_sve.c
@@ -9,9 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "warp_plane_neon.h" +#include <arm_neon.h> -#include <arm_neon_sve_bridge.h> +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "warp_plane_neon.h" DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, @@ -19,22 +20,8 @@ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { - // The 16-bit dot product instructions only exist in SVE and not Neon. - // We can get away without rewriting the existing Neon code by making use of - // the Neon-SVE bridge intrinsics to reinterpret a Neon vector as a SVE - // vector with the high part of the vector being "don't care", and then - // operating on that instead. - // This is clearly suboptimal in machines with a SVE vector length above - // 128-bits as the remainder of the vector is wasted, however this appears to - // still be beneficial compared to not using the instruction. 
- return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), - svset_neonq_s16(svundef_s16(), x), - svset_neonq_s16(svundef_s16(), y))); -} - -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -61,8 +48,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -99,7 +86,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -128,7 +116,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -165,8 +154,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -191,8 +180,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const 
int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -213,9 +203,10 @@ *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -250,10 +241,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2];
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h index 4c0cb99..4e14c4a 100644 --- a/av1/common/av1_common_int.h +++ b/av1/common/av1_common_int.h
@@ -17,7 +17,7 @@ #include "aom/internal/aom_codec_internal.h" #include "aom_dsp/flow_estimation/corner_detect.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/entropy.h"
diff --git a/av1/common/av1_rtcd.c b/av1/common/av1_rtcd.c index c484166..8a35dca 100644 --- a/av1/common/av1_rtcd.c +++ b/av1/common/av1_rtcd.c
@@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void av1_rtcd() { aom_once(setup_rtcd_internal); } +void av1_rtcd(void) { aom_once(setup_rtcd_internal); }
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 38e1da9..a24d3a9 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -77,6 +77,16 @@ } forward_decls qw/av1_common_forward_decls/; +# Fallbacks for Valgrind support +# For normal use, we require SSE4.1. However, 32-bit Valgrind does not support +# SSE4.1, so we include fallbacks for some critical functions to improve +# performance +$sse2_x86 = $ssse3_x86 = ''; +if ($opts{arch} eq "x86") { + $sse2_x86 = 'sse2'; + $ssse3_x86 = 'ssse3'; +} + # functions that are 64 bit only. $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; if ($opts{arch} eq "x86_64") { @@ -245,12 +255,11 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # directional intra predictor functions add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; - specialize qw/av1_highbd_dr_prediction_z1 avx2/; + specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; - - specialize qw/av1_highbd_dr_prediction_z2 avx2/; + specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; - specialize qw/av1_highbd_dr_prediction_z3 avx2/; + specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/; } # build compound seg mask functions @@ -319,10 +328,10 @@ # the transform coefficients are held in 32-bit # values, so the assembler code for av1_block_error can no longer be used. 
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/av1_block_error sse2 avx2 neon/; + specialize qw/av1_block_error sse2 avx2 neon sve/; add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size"; - specialize qw/av1_block_error_lp sse2 avx2 neon/; + specialize qw/av1_block_error_lp sse2 avx2 neon sve/; add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp sse2 avx2 neon/; @@ -346,7 +355,7 @@ #fwd txfm add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param"; - specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/; + specialize qw/av1_lowbd_fwd_txfm sse4_1 avx2 neon/, $sse2_x86; add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/; @@ -437,9 +446,9 @@ specialize qw/av1_txb_init_levels sse4_1 avx2 neon/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; - specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/; + specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon sve/; add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; - specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon/; + specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon sve/; add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; specialize qw/av1_wedge_compute_delta_squares sse2 avx2 
neon/; @@ -449,7 +458,7 @@ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats"; - specialize qw/av1_compute_stats sse4_1 avx2 neon/; + specialize qw/av1_compute_stats sse4_1 avx2 neon sve/; add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params sse4_1 avx2 neon/; add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; @@ -459,9 +468,9 @@ add_proto qw/void av1_calc_proj_params_high_bd/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/; add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; - specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/; - add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; - specialize qw/av1_compute_stats_highbd sse4_1 avx2 
neon/; + specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon sve/; } } @@ -485,6 +494,7 @@ if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") { specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/; } + specialize qw/av1_cnn_convolve_no_maxpool_padding_valid neon/; add_proto qw/void av1_cnn_deconvolve/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride"; add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std"; } @@ -521,29 +531,35 @@ # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. 
if ($opts{config} !~ /libs-x86-win32-vs.*/) { - specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_find_dir_dual sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_0 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_1 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_2 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_3 sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_0 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_1 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_2 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_3 sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; } # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams 
*conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/; + specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } +add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/av1_resize_vert_dir sse2 avx2/; + +add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2"; +specialize qw/av1_resize_horz_dir sse2 avx2/; + add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; @@ -584,27 +600,27 @@ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params"; - specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/; + specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm sve2/; specialize qw/av1_convolve_2d_sr_intrabc neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_x_sr_intrabc neon/; - specialize qw/av1_convolve_y_sr sse2 avx2 neon/; + specialize qw/av1_convolve_y_sr sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_convolve_y_sr_intrabc neon/; - specialize qw/av1_convolve_2d_scale sse4_1/; - specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon neon_dotprod neon_i8mm/; + specialize qw/av1_convolve_2d_scale sse4_1 neon 
neon_dotprod neon_i8mm/; + specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { - specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/; - specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon/; - specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/; + specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon sve2/; + specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon sve2/; + specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/; - specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/; - specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_x_sr_intrabc neon/; - specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_y_sr_intrabc neon/; specialize qw/av1_highbd_convolve_2d_scale sse4_1 neon/; }
diff --git a/av1/common/blockd.h b/av1/common/blockd.h index e7f1b6b..0cfd1f3 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h
@@ -1142,7 +1142,7 @@ return largest_tx_size; } -static const uint8_t mode_to_angle_map[] = { +static const uint8_t mode_to_angle_map[INTRA_MODES] = { 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, };
diff --git a/av1/common/cdef.c b/av1/common/cdef.c index 12e9545..5cec940 100644 --- a/av1/common/cdef.c +++ b/av1/common/cdef.c
@@ -10,15 +10,19 @@ */ #include <assert.h> -#include <math.h> +#include <stddef.h> #include <string.h> #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" +#include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" #include "av1/common/reconinter.h" #include "av1/common/thread_common.h" @@ -92,7 +96,7 @@ const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { - const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + const uint8_t *base = &src[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } @@ -101,7 +105,7 @@ int src_hoffset, int sstride, int vsize, int hsize) { const uint16_t *base = - &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + &CONVERT_TO_SHORTPTR(src)[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } @@ -247,7 +251,8 @@ static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { - int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; + ptrdiff_t offset = + (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset; if (use_highbitdepth) { av1_cdef_filter_fb( NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride,
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h index e86aa75..5c62201 100644 --- a/av1/common/cdef_block_simd.h +++ b/av1/common/cdef_block_simd.h
@@ -158,9 +158,6 @@ res[0] = v128_ziphi_64(tr1_7, tr1_6); } -// There is a separate Neon implementation of this function, so disable this -// one. -#if !HAVE_NEON int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { int i; @@ -199,7 +196,6 @@ *var >>= 10; return best_dir; } -#endif // Work around compiler out of memory issues with Win32 builds. This issue has // been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). @@ -209,9 +205,6 @@ #define CDEF_INLINE SIMD_INLINE #endif -// There is a separate Neon implementation of these functions, so disable this -// one. -#if !HAVE_NEON // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, unsigned int adjdamp) { @@ -830,7 +823,6 @@ copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); } } -#endif // HAVE_NEON void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride,
diff --git a/av1/common/cfl.c b/av1/common/cfl.c index 0e37d45..bd11c4a 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c
@@ -159,8 +159,9 @@ CFL_PREDICT_FN(c, lbd) #if CONFIG_AV1_HIGHBITDEPTH -void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, - int alpha_q3, int bit_depth, int width, int height) { +static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, + int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = clip_pixel_highbd(
diff --git a/av1/common/cfl.h b/av1/common/cfl.h index dcaa87b..dbb94d6 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h
@@ -96,6 +96,8 @@ // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ + const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ output_q3, width, height); \ @@ -171,6 +173,8 @@ // goodness. #define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst); \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ int16_t *dst) { \ subtract_average_##arch(src, dst, width, height, round_offset, \ num_pel_log2); \ @@ -220,24 +224,23 @@ return sub_avg[tx_size % TX_SIZES_ALL]; \ } -// For VSX SIMD optimization, the C versions of width == 4 subtract are -// faster than the VSX. As such, the VSX code calls the C versions. 
-void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); -void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); -void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); - -#define CFL_PREDICT_lbd(arch, width, height) \ - void cfl_predict_lbd_##width##x##height##_##arch( \ - const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ - int alpha_q3) { \ - cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ - height); \ +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ } #if CONFIG_AV1_HIGHBITDEPTH #define CFL_PREDICT_hbd(arch, width, height) \ void cfl_predict_hbd_##width##x##height##_##arch( \ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd); \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ int bd) { \ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ height); \
diff --git a/av1/common/debugmodes.c b/av1/common/debugmodes.c index 7e6160f..e67cf04 100644 --- a/av1/common/debugmodes.c +++ b/av1/common/debugmodes.c
@@ -9,17 +9,21 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include "av1/common/debugmodes.h" + #include <stdio.h> #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" +#if 0 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { fprintf(f, "%s", str); fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number, cm->show_frame, cm->quant_params.base_qindex); } + /* This function dereferences a pointer to the mbmi structure * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. @@ -87,6 +91,7 @@ fclose(mvs); } +#endif // 0 void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename) {
diff --git a/av1/common/debugmodes.h b/av1/common/debugmodes.h new file mode 100644 index 0000000..8f3a91c --- /dev/null +++ b/av1/common/debugmodes.h
@@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_DEBUGMODES_H_ +#define AOM_AV1_COMMON_DEBUGMODES_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file); +void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); +void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename); + +#endif // AOM_AV1_COMMON_DEBUGMODES_H_
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h index 09cd6bd..028bd21 100644 --- a/av1/common/entropymode.h +++ b/av1/common/entropymode.h
@@ -12,6 +12,7 @@ #ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ #define AOM_AV1_COMMON_ENTROPYMODE_H_ +#include "aom_ports/bitops.h" #include "av1/common/entropy.h" #include "av1/common/entropymv.h" #include "av1/common/filter.h" @@ -192,13 +193,7 @@ // Returns (int)ceil(log2(n)). static INLINE int av1_ceil_log2(int n) { if (n < 2) return 0; - int i = 1; - unsigned int p = 2; - while (p < (unsigned int)n) { - i++; - p = p << 1; - } - return i; + return get_msb(n - 1) + 1; } // Returns the context for palette color index at row 'r' and column 'c',
diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c index 6f88768..27a7f07 100644 --- a/av1/common/ppc/cfl_ppc.c +++ b/av1/common/ppc/cfl_ppc.c
@@ -124,6 +124,10 @@ // Based on observation, for small blocks VSX does not outperform C (no 64bit // load and store intrinsics). So we call the C code for block widths 4. +extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { cfl_subtract_average_4x4_c, /* 4x4 */
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c index b097628..58eb113 100644 --- a/av1/common/quant_common.c +++ b/av1/common/quant_common.c
@@ -9,10 +9,15 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include "config/aom_config.h" + +#include "aom/aom_frame_buffer.h" +#include "aom_scale/yv12config.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/entropy.h" +#include "av1/common/filter.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" @@ -274,13 +279,16 @@ : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER #define QM_TOTAL_SIZE 3344 // We only use wt_matrix_ref[q] and iwt_matrix_ref[q] // for q = 0, ..., NUM_QM_LEVELS - 2. static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +#endif void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER for (int q = 0; q < NUM_QM_LEVELS; ++q) { for (int c = 0; c < num_planes; ++c) { int current = 0; @@ -306,6 +314,10 @@ } } } +#else + (void)quant_params; + (void)num_planes; +#endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER } /* Provide 15 sets of quantization matrices for chroma and luma @@ -320,6 +332,8 @@ distances. Matrices for QM level 15 are omitted because they are not used. */ + +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ @@ -12873,4 +12887,6 @@ 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32 }, }, -}; \ No newline at end of file +}; + +#endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index 0b93d3b..c31f453 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h
@@ -449,7 +449,7 @@ #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) -void av1_init_wedge_masks(); +void av1_init_wedge_masks(void); static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, int8_t wedge_sign,
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c index 67fb13f..497863e 100644 --- a/av1/common/reconintra.c +++ b/av1/common/reconintra.c
@@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <math.h> #include "config/aom_config.h" @@ -959,21 +960,18 @@ } static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { - int ab_sm, le_sm; + const MB_MODE_INFO *above; + const MB_MODE_INFO *left; if (plane == 0) { - const MB_MODE_INFO *ab = xd->above_mbmi; - const MB_MODE_INFO *le = xd->left_mbmi; - ab_sm = ab ? is_smooth(ab, plane) : 0; - le_sm = le ? is_smooth(le, plane) : 0; + above = xd->above_mbmi; + left = xd->left_mbmi; } else { - const MB_MODE_INFO *ab = xd->chroma_above_mbmi; - const MB_MODE_INFO *le = xd->chroma_left_mbmi; - ab_sm = ab ? is_smooth(ab, plane) : 0; - le_sm = le ? is_smooth(le, plane) : 0; + above = xd->chroma_above_mbmi; + left = xd->chroma_left_mbmi; } - return (ab_sm || le_sm) ? 1 : 0; + return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane)); } static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { @@ -1071,7 +1069,7 @@ } } -static void build_intra_predictors( +static void build_directional_and_filter_intra_predictors( const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1090,6 +1088,7 @@ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + assert(use_filter_intra || is_dr_mode); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. 
width = 4 // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to @@ -1190,49 +1189,119 @@ return; } - if (is_dr_mode) { - int upsample_above = 0; - int upsample_left = 0; - if (!disable_edge_filter) { - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 1 : 0; - if (need_above && need_left && (txwpx + txhpx >= 24)) { - filter_intra_edge_corner(above_row, left_col); - } - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength( - txwpx, txhpx, p_angle - 90, intra_edge_filter_type); - const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); - av1_filter_intra_edge(above_row - ab_le, n_px, strength); - } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, intra_edge_filter_type); - const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); - av1_filter_intra_edge(left_col - ab_le, n_px, strength); - } + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + assert(need_above_left); + const int ab_le = 1; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner(above_row, left_col); } - upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, - intra_edge_filter_type); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_upsample_intra_edge(above_row, n_px); + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? 
txhpx : 0); + av1_filter_intra_edge(above_row - ab_le, n_px, strength); } - upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, - intra_edge_filter_type); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_upsample_intra_edge(left_col, n_px); + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge(left_col - ab_le, n_px, strength); } } - dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, - upsample_left, p_angle); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); +} + +// This function generates the pred data of a given block for non-directional +// intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH). 
+static void build_non_directional_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) { + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int i = 0; + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; + } return; } - // predict + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + + if (need_left) { + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = 
above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + if (mode == DC_PRED) { dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, left_col); @@ -1300,7 +1369,7 @@ } } -static void highbd_build_intra_predictors( +static void highbd_build_directional_and_filter_intra_predictors( const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1308,7 +1377,7 @@ int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint16_t *const above_row = above_data + 16; @@ -1322,7 +1391,8 @@ const uint16_t *left_ref = ref - 1; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; - int base = 128 << (bit_depth - 8); + assert(use_filter_intra || is_dr_mode); + const int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are @@ -1424,49 +1494,125 @@ return; } - if (is_dr_mode) { - int upsample_above = 0; - int upsample_left = 0; - if (!disable_edge_filter) { - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 
1 : 0; - if (need_above && need_left && (txwpx + txhpx >= 24)) { - highbd_filter_intra_edge_corner(above_row, left_col); - } - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength( - txwpx, txhpx, p_angle - 90, intra_edge_filter_type); - const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); - av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); - } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, intra_edge_filter_type); - const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); - av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); - } + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + assert(need_above_left); + const int ab_le = 1; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + highbd_filter_intra_edge_corner(above_row, left_col); } - upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, - intra_edge_filter_type); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); } - upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, - intra_edge_filter_type); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? 
txwpx : 0); - av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); } } - highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, - upsample_above, upsample_left, p_angle, bit_depth); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, bit_depth); +} + +// For HBD encode/decode, this function generates the pred data of a given +// block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, +// SMOOTH_V and PAETH). 
+static void highbd_build_non_directional_intra_predictors( + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px, + int bit_depth) { + int i = 0; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + const int base = 128 << (bit_depth - 8); + + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } return; } - // predict + DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + + if (need_left) { + aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i)); + } else if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( dst, dst_stride, above_row, left_col, bit_depth); @@ -1540,6 +1686,9 @@ const int txhpx = tx_size_high[tx_size]; const int x = col_off << MI_SIZE_LOG2; const int y = row_off << MI_SIZE_LOG2; + const int is_hbd = is_cur_buf_hbd(xd); + + assert(mode < INTRA_MODES); if (use_palette) { int r, c; @@ -1547,7 +1696,7 @@ xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; - if (is_cur_buf_hbd(xd)) { + if (is_hbd) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r 
< txhpx; ++r) { for (c = 0; c < txwpx; ++c) { @@ -1566,16 +1715,12 @@ } const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int txw = tx_size_wide_unit[tx_size]; - const int txh = tx_size_high_unit[tx_size]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int have_top = row_off || (ss_y ? xd->chroma_up_available : xd->up_available); const int have_left = col_off || (ss_x ? xd->chroma_left_available : xd->left_available); - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); // Distance between the right edge of this prediction block to // the frame right edge @@ -1583,6 +1728,36 @@ // Distance between the bottom edge of this prediction block to // the frame bottom edge const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx; + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + const int is_dr_mode = av1_is_directional_mode(mode); + + // The computations in this function, as well as in build_intra_predictors(), + // are generalized for all intra modes. Some of these operations are not + // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H, + // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a + // separate function build_non_directional_intra_predictors() is introduced + // for these modes to avoid redundant computations while generating pred data. + + const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0; + const int n_left_px = have_left ? 
AOMMIN(txhpx, yd + txhpx) : 0; + if (!use_filter_intra && !is_dr_mode) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + highbd_build_non_directional_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px, + xd->bd); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride, + mode, tx_size, n_top_px, n_left_px); + return; + } + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); const int right_available = mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; const int bottom_available = @@ -1596,8 +1771,6 @@ bsize = scale_chroma_bsize(bsize, ss_x, ss_y); } - const int is_dr_mode = av1_is_directional_mode(mode); - const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; int p_angle = 0; int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT; int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT; @@ -1629,25 +1802,23 @@ const int disable_edge_filter = !enable_intra_edge_filter; const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); + const int n_topright_px = + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right; + const int n_bottomleft_px = + have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left; #if CONFIG_AV1_HIGHBITDEPTH - if (is_cur_buf_hbd(xd)) { - highbd_build_intra_predictors( + if (is_hbd) { + highbd_build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? 
AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type, xd->bd); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type, xd->bd); return; } #endif - build_intra_predictors( + build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
diff --git a/av1/common/resize.c b/av1/common/resize.c index f89f7ca..505fccd 100644 --- a/av1/common/resize.c +++ b/av1/common/resize.c
@@ -18,6 +18,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" @@ -216,10 +217,6 @@ // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative -// Filters for factor of 2 downsampling. -static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; -static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; - static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) @@ -316,91 +313,6 @@ } } -static void interpolate_core_double_prec(const double *const input, - int in_length, double *output, - int out_length, - const int16_t *interp_filters, - int interp_taps) { - const int32_t delta = - (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / - out_length; - const int32_t offset = - in_length > out_length - ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + - out_length / 2) / - out_length - : -(((int32_t)(out_length - in_length) - << (RS_SCALE_SUBPEL_BITS - 1)) + - out_length / 2) / - out_length; - double *optr = output; - int x, x1, x2, k, int_pel, sub_pel; - double sum; - int32_t y; - - x = 0; - y = offset + RS_SCALE_EXTRA_OFF; - while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { - x++; - y += delta; - } - x1 = x; - x = out_length - 1; - y = delta * x + offset + RS_SCALE_EXTRA_OFF; - while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= - in_length) { - x--; - y -= delta; - } - x2 = x; - if (x1 > x2) { - for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; - ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) { - const int pk = int_pel - interp_taps / 2 + 1 + k; - sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; - } - *optr++ = sum / (1 << FILTER_BITS); - } - } else { - // Initial part. - for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; - *optr++ = sum / (1 << FILTER_BITS); - } - // Middle part. - for (; x <= x2; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; - *optr++ = sum / (1 << FILTER_BITS); - } - // End part. 
- for (; x < out_length; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * - input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; - *optr++ = sum / (1 << FILTER_BITS); - } - } -} - static void interpolate(const uint8_t *const input, int in_length, uint8_t *output, int out_length) { const InterpKernel *interp_filters = @@ -410,15 +322,6 @@ SUBPEL_TAPS); } -static void interpolate_double_prec(const double *const input, int in_length, - double *output, int out_length) { - const InterpKernel *interp_filters = - choose_interp_filter(in_length, out_length); - - interpolate_core_double_prec(input, in_length, output, out_length, - &interp_filters[0][0], SUBPEL_TAPS); -} - int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } @@ -434,8 +337,8 @@ return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } -static void down2_symeven(const uint8_t *const input, int length, - uint8_t *output) { +void down2_symeven(const uint8_t *const input, int length, uint8_t *output, + int start_offset) { // Actual filter len = 2 * filter_len_half. const int16_t *filter = av1_down2_symeven_half_filter; const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; @@ -447,7 +350,7 @@ l2 += (l2 & 1); if (l1 > l2) { // Short input length. - for (i = 0; i < length; i += 2) { + for (i = start_offset; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += @@ -459,7 +362,7 @@ } } else { // Initial part. 
- for (i = 0; i < l1; i += 2) { + for (i = start_offset; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; @@ -589,7 +492,7 @@ if (filteredlength & 1) down2_symodd(in, filteredlength, out); else - down2_symeven(in, filteredlength, out); + down2_symeven(in, filteredlength, out, 0); filteredlength = proj_filteredlength; } if (filteredlength != olength) { @@ -600,12 +503,6 @@ } } -static void upscale_multistep_double_prec(const double *const input, int length, - double *output, int olength) { - assert(length < olength); - interpolate_double_prec(input, length, output, olength); -} - static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; @@ -624,27 +521,62 @@ } } -static void fill_col_to_arr_double_prec(double *img, int stride, int len, - double *arr) { - int i; - double *iptr = img; - double *aptr = arr; - for (i = 0; i < len; ++i, iptr += stride) { - *aptr++ = *iptr; +bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { + bool mem_status = true; + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); + if (arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; } + + for (int i = start_col; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + down2_symeven(arrbuf, height, arrbuf2, 0); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; } -static void fill_arr_to_col_double_prec(double *img, int stride, int len, - double *arr) { - int i; - double *iptr = img; - double *aptr = arr; - for (i = 0; i < len; ++i, iptr += stride) { - *iptr = *aptr++; - } +void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, + 
uint8_t *intbuf, int height, int filtered_length, + int width2) { + for (int i = 0; i < height; ++i) + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i, + 0); } -bool av1_resize_plane(const uint8_t *const input, int height, int width, +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride) { + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); + if (intbuf == NULL) { + return false; + } + + // Resize in the horizontal direction + av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2); + // Resize in the vertical direction + bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height, + height2, width2, 0 /*start_col*/); + aom_free(intbuf); + return mem_status; +} + +// Check if both the output width and height are half of input width and +// height respectively. +bool should_resize_by_half(int height, int width, int height2, int width2) { + const bool is_width_by_2 = get_down2_length(width, 1) == width2; + const bool is_height_by_2 = get_down2_length(height, 1) == height2; + return (is_width_by_2 && is_height_by_2); +} + +bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; @@ -679,38 +611,6 @@ return mem_status; } -bool av1_upscale_plane_double_prec(const double *const input, int height, - int width, int in_stride, double *output, - int height2, int width2, int out_stride) { - int i; - bool mem_status = true; - double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); - double *arrbuf = (double *)aom_malloc(sizeof(double) * height); - double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); - if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) { - mem_status = false; - goto Error; - } - assert(width > 0); - assert(height > 0); - assert(width2 > 0); - assert(height2 
> 0); - for (i = 0; i < height; ++i) - upscale_multistep_double_prec(input + in_stride * i, width, - intbuf + width2 * i, width2); - for (i = 0; i < width2; ++i) { - fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); - upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); - fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); - } - -Error: - aom_free(intbuf); - aom_free(arrbuf); - aom_free(arrbuf2); - return mem_status; -} - static bool upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, @@ -1033,7 +933,7 @@ } } -void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, +void av1_highbd_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int bd) { int i; @@ -1132,10 +1032,9 @@ } #endif // CONFIG_AV1_HIGHBITDEPTH -void av1_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +void av1_resize_frame420(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1148,10 +1047,9 @@ abort(); } -bool av1_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame422(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, 
width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1165,10 +1063,9 @@ return true; } -bool av1_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame444(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1183,8 +1080,8 @@ } #if CONFIG_AV1_HIGHBITDEPTH -void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame420(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1197,8 +1094,8 @@ owidth / 2, ouv_stride, bd); } -void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame422(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1211,8 +1108,8 @@ owidth / 2, ouv_stride, bd); } -void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame444(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1247,9 +1144,11 @@ uint8_t *dst_buffer = dst->buffers[i]; const int dst_stride = dst->strides[is_uv]; for (int y = 0; y < dst_h; y += 16) { - const int y_q4 = y * 16 * src_h / dst_h + 
phase_scaler; + const int y_q4 = + src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler; for (int x = 0; x < dst_w; x += 16) { - const int x_q4 = x * 16 * src_w / dst_w + phase_scaler; + const int x_q4 = + src_w == dst_w ? 0 : x * 16 * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w; uint8_t *dst_ptr = dst_buffer + y * dst_stride + x; @@ -1276,7 +1175,7 @@ bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, - const int num_planes) { + int num_planes) { // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet @@ -1396,8 +1295,7 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, - const bool for_psnr, const int border_in_pixels, - const int num_pyramid_levels) { + const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid) { // If scaling is performed for the sole purpose of calculating PSNR, then our // target dimensions are superres upscaled width/height. Otherwise our target // dimensions are coded width/height. @@ -1417,7 +1315,7 @@ scaled, scaled_width, scaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, - num_pyramid_levels, 0)) + alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); @@ -1513,7 +1411,7 @@ // TODO(afergs): aom_ vs av1_ functions? Which can I use? // Upscale decoded image. 
void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, - int num_pyramid_levels) { + bool alloc_pyramid) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; const SequenceHeader *const seq_params = cm->seq_params; @@ -1528,7 +1426,7 @@ if (aom_alloc_frame_buffer( &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, byte_alignment, 0, 0)) + AOM_BORDER_IN_PIXELS, byte_alignment, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); @@ -1561,7 +1459,7 @@ cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, - num_pyramid_levels, 0)) { + alloc_pyramid, 0)) { unlock_buffer_pool(pool); aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, @@ -1578,7 +1476,7 @@ frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, byte_alignment, num_pyramid_levels, 0)) + AOM_BORDER_IN_PIXELS, byte_alignment, alloc_pyramid, 0)) aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling");
diff --git a/av1/common/resize.h b/av1/common/resize.h index d1fab82..6b233f8 100644 --- a/av1/common/resize.h +++ b/av1/common/resize.h
@@ -20,47 +20,45 @@ extern "C" { #endif -bool av1_resize_plane(const uint8_t *const input, int height, int width, +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); -bool av1_upscale_plane_double_prec(const double *const input, int height, - int width, int in_stride, double *output, - int height2, int width2, int out_stride); // TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from // av1/exports_com and delete this function. -void av1_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +void av1_resize_frame420(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -bool av1_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame422(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -bool av1_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame444(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t 
*ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, +void av1_highbd_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int bd); -void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame420(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd); -void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame422(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd); -void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame444(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -76,12 +74,11 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, - const bool for_psnr, const int border_in_pixels, - const int num_pyramid_levels); + const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid); bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, - const int num_planes); + int num_planes); // Calculates the scaled dimensions from the given original dimensions and the // 
resize scale denominator. @@ -98,7 +95,16 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, - int num_pyramid_levels); + bool alloc_pyramid); + +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride); + +void down2_symeven(const uint8_t *const input, int length, uint8_t *output, + int start_offset); + +bool should_resize_by_half(int height, int width, int height2, int width2); // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
diff --git a/av1/common/restoration.c b/av1/common/restoration.c index a26f329..335fdc8 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c
@@ -11,20 +11,24 @@ */ #include <math.h> +#include <stddef.h> #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" +#include "aom/internal/aom_codec_internal.h" #include "aom_mem/aom_mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_util/aom_pthread.h" + #include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/enums.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/thread_common.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" - -#include "aom_ports/mem.h" // The 's' values are calculated based on original 'r' and 'e' values in the // spec using GenSgrprojVtable(). @@ -90,7 +94,7 @@ // Index 1 corresponds to r[1], e[1] int sgrproj_mtable[SGRPROJ_PARAMS][2]; -static void GenSgrprojVtable() { +static void GenSgrprojVtable(void) { for (int i = 0; i < SGRPROJ_PARAMS; ++i) { const sgr_params_type *const params = &av1_sgr_params[i]; for (int j = 0; j < 2; ++j) { @@ -109,14 +113,15 @@ } #endif -void av1_loop_restoration_precal() { +void av1_loop_restoration_precal(void) { #if 0 GenSgrprojVtable(); #endif } -static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert) { +static void extend_frame_lowbd(uint8_t *data, int width, int height, + ptrdiff_t stride, int border_horz, + int border_vert) { uint8_t *data_p; int i; for (i = 0; i < height; ++i) { @@ -136,7 +141,8 @@ #if CONFIG_AV1_HIGHBITDEPTH static void extend_frame_highbd(uint16_t *data, int width, int height, - int stride, int border_horz, int border_vert) { + ptrdiff_t stride, int border_horz, + int border_vert) { uint16_t *data_p; int i, j; for (i = 0; i < height; ++i) { @@ -988,8 +994,10 @@ int unit_h = limits->v_end - limits->v_start; int unit_w = limits->h_end - limits->h_start; - uint8_t *data8_tl = data8 + limits->v_start * stride + 
limits->h_start; - uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; + uint8_t *data8_tl = + data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start; + uint8_t *dst8_tl = + dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start; if (unit_rtype == RESTORE_NONE) { copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, @@ -1074,7 +1082,8 @@ if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK) + cm->features.byte_alignment, NULL, NULL, NULL, false, + 0) != AOM_CODEC_OK) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); @@ -1349,7 +1358,7 @@ const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; - const uint8_t *src_rows = src_buf + row * src_stride; + const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below; @@ -1404,7 +1413,7 @@ const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; - const uint8_t *src_rows = src_buf + row * src_stride; + const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below;
diff --git a/av1/common/restoration.h b/av1/common/restoration.h index d5da81d..644e069 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h
@@ -410,7 +410,7 @@ void *lr_ctxt); /*!\cond */ -void av1_loop_restoration_precal(); +void av1_loop_restoration_precal(void); struct AV1LrSyncData;
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 8a6f290..8a137cc 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c
@@ -14,12 +14,19 @@ #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" +#include "aom_util/aom_thread.h" #include "av1/common/av1_loopfilter.h" +#include "av1/common/blockd.h" +#include "av1/common/cdef.h" #include "av1/common/entropymode.h" +#include "av1/common/enums.h" #include "av1/common/thread_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" +#include "av1/common/restoration.h" // Set up nsync by width. static INLINE int get_sync_range(int width) { @@ -57,7 +64,6 @@ void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, int width, int num_workers) { lf_sync->rows = rows; - lf_sync->lf_mt_exit = false; #if CONFIG_MULTITHREAD { int i, j; @@ -234,7 +240,12 @@ if (sig) { pthread_mutex_lock(&lf_sync->mutex_[plane][r]); - lf_sync->cur_sb_col[plane][r] = cur; + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&lf_sync->cond_[plane][r]); pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); @@ -373,9 +384,7 @@ error_info = ((LFWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Row-based multi-threaded loopfilter hook @@ -551,7 +560,13 @@ if (sig) { pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); - loop_res_sync->cur_sb_col[plane][r] = cur; + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. 
In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + loop_res_sync->cur_sb_col[plane][r] = + AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); @@ -601,7 +616,8 @@ } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, - aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata)))); + aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata)))); + lr_sync->num_workers = num_workers; for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { if (worker_idx < num_workers - 1) { @@ -616,9 +632,6 @@ } } - lr_sync->num_workers = num_workers; - lr_sync->lr_mt_exit = false; - for (int j = 0; j < num_planes; j++) { CHECK_MEM_ERROR( cm, lr_sync->cur_sb_col[j], @@ -898,9 +911,7 @@ error_info = ((LRWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, @@ -932,6 +943,7 @@ av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, cm->width); } + lr_sync->lr_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. 
for (i = 0; i < num_planes; i++) { @@ -985,6 +997,7 @@ cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; } static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, @@ -1021,9 +1034,7 @@ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Updates the row index of the next job to be processed.
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h index 6d695e8..7e681f3 100644 --- a/av1/common/thread_common.h +++ b/av1/common/thread_common.h
@@ -16,6 +16,7 @@ #include "av1/common/av1_loopfilter.h" #include "av1/common/cdef.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus @@ -269,6 +270,7 @@ av1_loop_filter_dealloc(lf_sync); av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } + lf_sync->lf_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. for (int i = 0; i < MAX_MB_PLANE; i++) { @@ -314,15 +316,21 @@ } } -static AOM_INLINE int check_planes_to_loop_filter(const struct loopfilter *lf, - int *planes_to_lf, - int plane_start, - int plane_end) { +static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf, + int planes_to_lf[MAX_MB_PLANE], + int plane_start, + int plane_end) { // For each luma and chroma plane, whether to filter it or not. planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) && plane_start <= 0 && 0 < plane_end; planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end; planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end; +} + +static AOM_INLINE int check_planes_to_loop_filter( + const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE], + int plane_start, int plane_end) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); // If the luma plane is purposely not filtered, neither are the chroma // planes. if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0;
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c index b964f25..45a189d 100644 --- a/av1/common/tile_common.c +++ b/av1/common/tile_common.c
@@ -177,46 +177,16 @@ cm->seq_params->mib_size_log2); } -PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, - int is_uv) { - PixelRect r; - - // Calculate position in the Y plane - r.left = tile_info->mi_col_start * MI_SIZE; - r.right = tile_info->mi_col_end * MI_SIZE; - r.top = tile_info->mi_row_start * MI_SIZE; - r.bottom = tile_info->mi_row_end * MI_SIZE; - - // If upscaling is enabled, the tile limits need scaling to match the - // upscaled frame where the restoration units live. To do this, scale up the - // top-left and bottom-right of the tile. - if (av1_superres_scaled(cm)) { - av1_calculate_unscaled_superres_size(&r.left, &r.top, - cm->superres_scale_denominator); - av1_calculate_unscaled_superres_size(&r.right, &r.bottom, - cm->superres_scale_denominator); - } - - const int frame_w = cm->superres_upscaled_width; - const int frame_h = cm->superres_upscaled_height; - - // Make sure we don't fall off the bottom-right of the frame. - r.right = AOMMIN(r.right, frame_w); - r.bottom = AOMMIN(r.bottom, frame_h); - - // Convert to coordinates in the appropriate plane - const int ss_x = is_uv && cm->seq_params->subsampling_x; - const int ss_y = is_uv && cm->seq_params->subsampling_y; - - r.left = ROUND_POWER_OF_TWO(r.left, ss_x); - r.right = ROUND_POWER_OF_TWO(r.right, ss_x); - r.top = ROUND_POWER_OF_TWO(r.top, ss_y); - r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); - - return r; -} - -void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { +// Section 7.3.1 of the AV1 spec says, on pages 200-201: +// It is a requirement of bitstream conformance that the following conditions +// are met: +// ... +// * TileHeight is equal to (use_128x128_superblock ? 128 : 64) for all +// tiles (i.e. the tile is exactly one superblock high) +// * TileWidth is identical for all tiles and is an integer multiple of +// TileHeight (i.e. the tile is an integer number of superblocks wide) +// ... 
+bool av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { const CommonTileParams *const tiles = &cm->tiles; if (tiles->uniform_spacing) { *w = tiles->width; @@ -226,7 +196,10 @@ const int tile_width_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; const int tile_w = tile_width_sb * cm->seq_params->mib_size; - assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + // ensure all tiles have same dimension + if (i != 0 && tile_w != *w) { + return false; + } *w = tile_w; } @@ -234,10 +207,14 @@ const int tile_height_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; const int tile_h = tile_height_sb * cm->seq_params->mib_size; - assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + // ensure all tiles have same dimension + if (i != 0 && tile_h != *h) { + return false; + } *h = tile_h; } } + return true; } int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h index 5383ae9..12228c9 100644 --- a/av1/common/tile_common.h +++ b/av1/common/tile_common.h
@@ -12,13 +12,14 @@ #ifndef AOM_AV1_COMMON_TILE_COMMON_H_ #define AOM_AV1_COMMON_TILE_COMMON_H_ +#include <stdbool.h> + +#include "config/aom_config.h" + #ifdef __cplusplus extern "C" { #endif -#include "config/aom_config.h" -#include "aom_dsp/rect.h" - struct AV1Common; struct SequenceHeader; struct CommonTileParams; @@ -43,10 +44,6 @@ int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile); int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile); -// Return the pixel extents of the given tile -PixelRect av1_get_tile_rect(const TileInfo *tile_info, - const struct AV1Common *cm, int is_uv); - // Define tile maximum width and area // There is no maximum height since height is limited by area and width limits // The minimum tile width or height is fixed at one superblock @@ -56,7 +53,9 @@ #define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608) #endif -void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); +// Gets the width and height (in units of MI_SIZE) of the tiles in a tile list. +// Returns true on success, false on failure. +bool av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, int cm_mi_rows, int cm_mi_cols,
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index f376e16..4282b92 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c
@@ -291,9 +291,7 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -306,6 +304,10 @@ (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + for (int i = p_row; i < p_row + p_height; i += 8) { for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block,
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c index 67b28bc..8e293b5 100644 --- a/av1/common/x86/av1_convolve_scale_sse4.c +++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -12,7 +12,7 @@ #include <assert.h> #include <smmintrin.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/cdef_block_sse2.c b/av1/common/x86/cdef_block_sse2.c deleted file mode 100644 index 5ab7ffa..0000000 --- a/av1/common/x86/cdef_block_sse2.c +++ /dev/null
@@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_sse2 -#include "av1/common/cdef_block_simd.h" - -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, - int stride, int32_t *var_out_1st, - int32_t *var_out_2nd, int coeff_shift, - int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { - // Process first 8x8. - *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); - - // Process second 8x8. - *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); -} - -void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, - const uint8_t *src, int sstride, - int width, int height) { - int j = 0; - for (int i = 0; i < height; i++) { - for (j = 0; j < (width & ~0x7); j += 8) { - v64 row = v64_load_unaligned(&src[i * sstride + j]); - v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); - } - for (; j < width; j++) { - dst[i * dstride + j] = src[i * sstride + j]; - } - } -}
diff --git a/av1/common/x86/cdef_block_ssse3.c b/av1/common/x86/cdef_block_ssse3.c index 0fb36eb..14eb6c9 100644 --- a/av1/common/x86/cdef_block_ssse3.c +++ b/av1/common/x86/cdef_block_ssse3.c
@@ -9,6 +9,17 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +// Include SSSE3 CDEF code only for 32-bit x86, to support Valgrind. +// For normal use, we require SSE4.1, so cdef_*_sse4_1 will be used instead of +// these functions. However, 32-bit Valgrind does not support SSE4.1, so we +// include a fallback to SSSE3 to improve performance + +#include "config/aom_config.h" + +#if !AOM_ARCH_X86 +#error "cdef_block_ssse3.c is included for compatibility with 32-bit x86 only" +#endif // !AOM_ARCH_X86 + #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_ssse3 #include "av1/common/cdef_block_simd.h"
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index 1b39a0a..d4c1169 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c
@@ -21,13 +21,11 @@ #include "av1/common/convolve.h" -void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, - const int subpel_y_qn, - ConvolveParams *conv_params) { +static void convolve_2d_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { if (filter_params_x->taps > 8) { const int bd = 8; int im_stride = 8, i; @@ -150,9 +148,9 @@ const bool use_general = (tap_x == 12 || tap_y == 12); if (use_general) { - av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, - subpel_x_q4, subpel_y_q4, conv_params); + convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_q4, + subpel_y_q4, conv_params); } else { av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y,
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index 1b85f37..68971ea 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c
@@ -19,12 +19,11 @@ #include "aom_dsp/x86/convolve_common_intrin.h" #include "av1/common/convolve.h" -void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int subpel_y_qn, - ConvolveParams *conv_params) { +static void convolve_2d_sr_12tap_sse2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; DECLARE_ALIGNED(16, int16_t, @@ -231,9 +230,9 @@ filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } else { - av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, - subpel_x_qn, subpel_y_qn, conv_params); + convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); } } else { const int bd = 8;
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index 012e75c..9272e91 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c
@@ -16,6 +16,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/synonyms.h" #include "av1/common/convolve.h" static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, @@ -75,10 +76,10 @@ return convolve(ss, coeffs); } -void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_y, - int subpel_y_qn) { +static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + int subpel_y_qn) { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); @@ -185,8 +186,8 @@ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } else { - av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_y, subpel_y_qn); + convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); } } else { const int fo_vert = filter_params_y->taps / 2 - 1; @@ -200,31 +201,23 @@ if (w <= 4) { __m128i s[8], src6, res, res_round, res16; int res_int; - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); - s[4] = 
_mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), + xx_loadl_32(src_ptr + 1 * src_stride)); + s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), + xx_loadl_32(src_ptr + 2 * src_stride)); + s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), + xx_loadl_32(src_ptr + 3 * src_stride)); + s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), + xx_loadl_32(src_ptr + 4 * src_stride)); + s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), + xx_loadl_32(src_ptr + 5 * src_stride)); + src6 = xx_loadl_32(src_ptr + 6 * src_stride); + s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); + src6 = xx_loadl_32(src_ptr + 8 * src_stride); + s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); @@ -337,11 +330,11 @@ } } -void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - int subpel_x_qn, - ConvolveParams *conv_params) { +static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + int subpel_x_qn, + ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; const 
int bits = FILTER_BITS - conv_params->round_0; @@ -402,8 +395,8 @@ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { - av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, - filter_params_x, subpel_x_qn, conv_params); + convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); } } else { const int fo_horiz = filter_params_x->taps / 2 - 1;
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c index de850ee..d65318c 100644 --- a/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -12,7 +12,7 @@ #include <immintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h"
diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c index b2c39cd..89d7199 100644 --- a/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -13,7 +13,7 @@ #include <smmintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c index 8324044..88974ba 100644 --- a/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -12,7 +12,7 @@ #include <tmmintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c index da52ecd..6dcac10 100644 --- a/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -12,7 +12,7 @@ #include <immintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h"
diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c index af45764..5a7fc53 100644 --- a/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -12,7 +12,7 @@ #include <smmintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c index 7f6aceb..75108b4 100644 --- a/av1/common/x86/highbd_warp_affine_avx2.c +++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -22,9 +22,7 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m256i tmp[15]; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -37,6 +35,10 @@ (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c index 9df0ddc..96fb4cf 100644 --- a/av1/common/x86/highbd_warp_plane_sse4.c +++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -302,9 +302,7 @@ int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -313,6 +311,10 @@ assert(!(bd == 12 && reduce_bits_horiz < 5)); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
diff --git a/av1/common/x86/highbd_wiener_convolve_avx2.c b/av1/common/x86/highbd_wiener_convolve_avx2.c index ea8b35b..562c623 100644 --- a/av1/common/x86/highbd_wiener_convolve_avx2.c +++ b/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -12,7 +12,7 @@ #include <immintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h"
diff --git a/av1/common/x86/highbd_wiener_convolve_ssse3.c b/av1/common/x86/highbd_wiener_convolve_ssse3.c index 1c88474..cab37fa 100644 --- a/av1/common/x86/highbd_wiener_convolve_ssse3.c +++ b/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -12,7 +12,7 @@ #include <tmmintrin.h> #include <assert.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h"
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c index ae8f88e..9f82ed2 100644 --- a/av1/common/x86/jnt_convolve_avx2.c +++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -12,7 +12,7 @@ #include <emmintrin.h> #include <immintrin.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c index ab937f9..6b12278 100644 --- a/av1/common/x86/jnt_convolve_sse2.c +++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -11,10 +11,11 @@ #include <emmintrin.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/synonyms.h" void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, @@ -178,31 +179,23 @@ if (w == 4) { __m128i s[8], src6, res, res_shift; - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), + xx_loadl_32(src_ptr + 1 * src_stride)); + s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), + xx_loadl_32(src_ptr + 2 * src_stride)); + s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), + xx_loadl_32(src_ptr + 3 * src_stride)); + s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), + xx_loadl_32(src_ptr + 4 * src_stride)); + s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), + xx_loadl_32(src_ptr + 5 * src_stride)); + src6 = xx_loadl_32(src_ptr + 6 * src_stride); + s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(int 
*)(src_ptr + 7 * src_stride))); - src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); + src6 = xx_loadl_32(src_ptr + 8 * src_stride); + s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_shift = _mm_sll_epi32(res, left_shift); @@ -375,232 +368,3 @@ } while (j < w); } } - -void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int subpel_y_qn, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = 
_mm_set1_epi16((1 << rounding_shift) >> 1); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - __m128i temp_lo, temp_hi; - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - const __m128i src_lo = _mm_unpacklo_epi8(data, zero); - const __m128i src_hi = _mm_unpackhi_epi8(data, zero); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); - temp_lo = _mm_srli_si128(src_lo, 4); - temp_hi = _mm_slli_si128(src_hi, 12); - const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - temp_lo = _mm_srli_si128(src_lo, 8); - temp_hi = _mm_slli_si128(src_hi, 8); - const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - temp_lo = _mm_srli_si128(src_lo, 12); - temp_hi = _mm_slli_si128(src_hi, 4); - const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, 
res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - temp_lo = _mm_srli_si128(src_lo, 2); - temp_hi = _mm_slli_si128(src_hi, 14); - const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - temp_lo = _mm_srli_si128(src_lo, 6); - temp_hi = _mm_slli_si128(src_hi, 10); - const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - temp_lo = _mm_srli_si128(src_lo, 10); - temp_hi = _mm_slli_si128(src_hi, 6); - const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - temp_lo = _mm_srli_si128(src_lo, 14); - temp_hi = _mm_slli_si128(src_hi, 2); - const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_qn & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = 
_mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = 
_mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); - const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - - if (w > 4) - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - else - *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - } - } - } -}
diff --git a/av1/common/x86/jnt_convolve_ssse3.c b/av1/common/x86/jnt_convolve_ssse3.c index d0cf763..f6bf678 100644 --- a/av1/common/x86/jnt_convolve_ssse3.c +++ b/av1/common/x86/jnt_convolve_ssse3.c
@@ -11,7 +11,7 @@ #include <tmmintrin.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c index 71fab7a..4bc5aa4 100644 --- a/av1/common/x86/reconinter_avx2.c +++ b/av1/common/x86/reconinter_avx2.c
@@ -576,7 +576,7 @@ } } } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) {
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c index 95814b4..eb4a4d1 100644 --- a/av1/common/x86/reconinter_sse4.c +++ b/av1/common/x86/reconinter_sse4.c
@@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "av1/common/blockd.h" +#include "config/av1_rtcd.h" static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, const __m128i s1) {
diff --git a/av1/common/x86/reconinter_ssse3.c b/av1/common/x86/reconinter_ssse3.c index c9a3709..b177958 100644 --- a/av1/common/x86/reconinter_ssse3.c +++ b/av1/common/x86/reconinter_ssse3.c
@@ -76,7 +76,7 @@ } } } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) {
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c new file mode 100644 index 0000000..9c8958e --- /dev/null +++ b/av1/common/x86/resize_avx2.c
@@ -0,0 +1,744 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <string.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define ROW_OFFSET 5 +#define CAST_HI(x) _mm256_castsi128_si256(x) +#define CAST_LOW(x) _mm256_castsi256_si128(x) + +#define PROCESS_RESIZE_Y_WD16 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ + \ + /* g0... g15 | i0... i15 */ \ + const __m256i s68 = \ + _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ + /* h0... h15 | j0... j15 */ \ + const __m256i s79 = \ + _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ + \ + /* g0h0... g7h7 | i0j0... i7j7 */ \ + s[3] = _mm256_unpacklo_epi8(s68, s79); \ + /* g8h8... g15h15 | i8j8... i15j15 */ \ + s[8] = _mm256_unpackhi_epi8(s68, s79); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20... 
r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + __m256i res_out_b[2] = { 0 }; \ + resize_convolve(s + 5, coeffs_y, res_out_b); \ + \ + /* r08... r015 */ \ + __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ + /* r28... r215 */ \ + __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ + res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ + res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ + \ + /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \ + __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r08... r011 r28... r211 | r012... r015 r212... r215 */ \ + __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ + /* r00... r07 | r20... r27 */ \ + res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ + /* r08... r015 | r28... r215 */ \ + res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ + /* r00... r015 | r20... r215 */ \ + res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ + res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ + res_8bit0 = _mm256_max_epu8(res_8bit0, zero); + +#define PROCESS_RESIZE_Y_WD8 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ + \ + /* g0h0... g7h7 */ \ + s67 = _mm_unpacklo_epi8(l6, l7); \ + /* i0j0...i7j7 */ \ + __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ + \ + /* g0h0...g7h7 | i0j0...i7j7 */ \ + s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_convolve(s, coeffs_y, res_out); \ + \ + /* r00... 
r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20...r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \ + res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r00...r07 | r20...r27 */ \ + res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \ + res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \ + res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ + res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); + +#define PROCESS_RESIZE_X_WD32 \ + /* a0 a1 ..... a30 a31 */ \ + __m256i row0 = _mm256_loadu_si256( \ + (__m256i *)&input[i * in_stride + j - filter_offset]); \ + /* b0 b1 ..... b30 b31 */ \ + __m256i row1 = _mm256_loadu_si256( \ + (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \ + /* a0 .... a15 || b0.... b15 */ \ + __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \ + /* a16 .... a31 || b16 .... b31 */ \ + __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \ + filter_offset = 3; \ + \ + /* Pad start pixels to the left, while processing the first pixels in the \ + * row. */ \ + if (j == 0) { \ + /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \ + row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \ + /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \ + row1 = _mm256_alignr_epi8(r1, r0, 13); \ + r0 = row0; \ + r1 = row1; \ + } \ + const int is_last_cols32 = (j + 32 == filtered_length); \ + /* Avoid loading extra pixels at frame boundary.*/ \ + if (is_last_cols32) row_offset = ROW_OFFSET; \ + /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \ + __m128i row0_0 = _mm_loadl_epi64( \ + (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \ + /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... 
*/ \ + __m128i row1_0 = \ + _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \ + filter_offset - row_offset]); \ + __m256i r2 = _mm256_permute2x128_si256( \ + _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \ + \ + /* Pad end pixels to the right, while processing the last pixels in the \ + * row. */ \ + if (is_last_cols32) { \ + r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \ + wd32_end_pad_mask); \ + } \ + \ + /* Process even pixels of the first row */ \ + /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \ + s0[0] = _mm256_alignr_epi8(r1, r0, 0); \ + /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \ + s0[1] = _mm256_alignr_epi8(r1, r0, 2); \ + /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \ + s0[2] = _mm256_alignr_epi8(r1, r0, 4); \ + /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \ + s0[3] = _mm256_alignr_epi8(r1, r0, 6); \ + \ + /* Process even pixels of the second row */ \ + /* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \ + s1[0] = _mm256_alignr_epi8(r2, r1, 0); \ + /* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \ + s1[1] = _mm256_alignr_epi8(r2, r1, 2); \ + /* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \ + s1[2] = _mm256_alignr_epi8(r2, r1, 4); \ + /* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \ + s1[3] = _mm256_alignr_epi8(r2, r1, 6); \ + \ + /* The register res_out_0 stores the result of start-16 pixels corresponding \ + * to the first and second rows whereas res_out_1 stores the end-16 \ + * pixels. 
*/ \ + __m256i res_out_0[2], res_out_1[2]; \ + res_out_1[0] = res_out_1[1] = zero; \ + res_out_0[0] = res_out_0[1] = zero; \ + resize_convolve(s0, coeffs_x, res_out_0); \ + resize_convolve(s1, coeffs_x, res_out_1); \ + \ + /* Result of 32 pixels of row0 (a0 to a32) */ \ + res_out_0[0] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \ + res_out_1[0] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \ + /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \ + __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \ + \ + /* Result of 32 pixels of row1 (b0 to b32) */ \ + res_out_0[1] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \ + res_out_1[1] = _mm256_sra_epi32( \ + _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \ + /* r10-r13 r18-r111 | r14-r17 r112-r115 */ \ + __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \ + \ + /* Convert the result from 16bit to 8bit */ \ + /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \ + */ \ + __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \ + __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \ + res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \ + __m128i low_128 = CAST_LOW(res_out_row01); \ + __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \ + \ + _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \ + _mm_unpacklo_epi32(low_128, high_128)); \ + _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \ + _mm_unpackhi_epi32(low_128, high_128)); + +static INLINE void resize_convolve(const __m256i *const s, + const __m256i *const coeffs, + __m256i *res_out) { + const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const 
__m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + const __m256i dst_0 = _mm256_add_epi16(res_0, res_1); + const __m256i dst_1 = _mm256_add_epi16(res_2, res_3); + // The sum of convolve operation crosses signed 16bit. Hence, the addition + // should happen in 32bit. + const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0)); + const __m256i dst_01 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1)); + const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1)); + const __m256i dst_11 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1)); + + res_out[0] = _mm256_add_epi32(dst_00, dst_10); + res_out[1] = _mm256_add_epi32(dst_01, dst_11); +} + +static INLINE void prepare_filter_coeffs(const int16_t *filter, + __m256i *const coeffs /* [4] */) { + // f0 f1 f2 f3 x x x x + const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); + // f0 f1 f2 f3 f0 f1 f2 f3 + const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44); + // f0 f1 f2 f3 f1 f0 f3 f2 + const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1); + + const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1); + + // f0 f1 f0 f1 .. + coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); + // f2 f3 f2 f3 .. + coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); + // f3 f2 f3 f2 .. + coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); + // f1 f0 f1 f0 .. + coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); +} + +bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, + int start_col) { + assert(start_col <= stride); + // For the GM tool, the input layer height or width is assured to be an even + // number. Hence the function 'down2_symodd()' is not invoked and SIMD + // optimization of the same is not implemented. + // When the input height is less than 8 and even, the potential input + // heights are limited to 2, 4, or 6. 
These scenarios require separate + // handling due to padding requirements. Invoking the C function here will + // eliminate the need for conditional statements within the subsequent SIMD + // code to manage these cases. + if (height & 1 || height < 8) { + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m256i s[10], coeffs_y[4]; + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int num_col16 = stride / 16; + int remain_col = stride % 16; + // The core vertical SIMD processes 4 input rows simultaneously to generate + // output corresponding to 2 rows. To streamline the core loop and eliminate + // the need for conditional checks, the remaining rows (4 or 6) are processed + // separately. + const int remain_row = (height % 4 == 0) ? 4 : 6; + + for (int j = start_col; j < stride - remain_col; j += 16) { + const uint8_t *data = &intbuf[j]; + const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. 
+ const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); + + // a0...a15 | c0...c15 + const __m256i s02 = + _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); + // b0...b15 | d0...d15 + const __m256i s13 = + _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); + // c0...c15 | e0...e15 + const __m256i s24 = + _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); + // d0...d15 | f0...f15 + const __m256i s35 = + _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); + // e0...e15 | g0...g15 + const __m256i s46 = + _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); + // f0...f15 | h0...h15 + const __m256i s57 = + _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_unpacklo_epi8(s02, s13); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_unpacklo_epi8(s24, s35); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_unpacklo_epi8(s46, s57); + + // a8b8...a15b15 | c8d8...c15d15 + s[5] = _mm256_unpackhi_epi8(s02, s13); + // c8d8...c15d15 | e8f8...e15f15 + s[6] = _mm256_unpackhi_epi8(s24, s35); + // e8f8...e15f15 | g8h8...g15h15 + s[7] = _mm256_unpackhi_epi8(s46, s57); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + + _mm_storeu_si128( + (__m128i *)&output[(i / 2) * out_stride + j + out_stride], + _mm256_extracti128_si256(res_8bit0, 1)); + + // Load the required data for processing of next 4 input rows. 
+ const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); + + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + _mm256_extracti128_si256(res_8bit0, 1)); + i += 2; + + // Check if there is any remaining height to process. If so, perform the + // necessary data loading for processing the next row. + if (i < height - 1) { + l10 = l11 = l9; + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + } + } + + if (remain_col > 7) { + const int processed_wd = num_col16 * 16; + remain_col = stride % 8; + + const uint8_t *data = &intbuf[processed_wd]; + + const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with available top-most row. 
+ const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); + + // a0b0...a7b7 + const __m128i s01 = _mm_unpacklo_epi8(l0, l1); + // c0d0...c7d7 + const __m128i s23 = _mm_unpacklo_epi8(l2, l3); + // e0f0...e7f7 + const __m128i s45 = _mm_unpacklo_epi8(l4, l5); + // g0h0...g7h7 + __m128i s67 = _mm_unpacklo_epi8(l10, l11); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], + _mm256_extracti128_si256(res_a_round_1, 1)); + + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); + + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + + // Process the remaining last 4 or 6 rows here. 
+ int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd], + _mm256_extracti128_si256(res_a_round_1, 1)); + i += 2; + + // Check rows are still remaining for processing. If yes do the required + // load of data for the next iteration. + if (i < height - 1) { + l10 = l11 = l9; + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + } + } + + if (remain_col) + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); + + return true; +} + +// Masks used for width 32 and 8 pixels, with left and right padding +// requirements +static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, + 0, 0, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12 }; + +static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const uint8_t wd8_right_padding_mask[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 +}; + +void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, int filtered_length, + int width2) { + assert(height % 2 == 0); + // Invoke SSE2 for width less than 32. 
+ if (filtered_length < 32) { + av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, + width2); + return; + } + + const int filt_length = sizeof(av1_down2_symeven_half_filter); + assert(filt_length % 2 == 0); + (void)filt_length; + + __m256i s0[4], s1[4], coeffs_x[4]; + + const int bits = FILTER_BITS; + const int dst_stride = width2; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i wd32_start_pad_mask = + _mm256_loadu_si256((__m256i *)wd32_left_padding_mask); + const __m256i wd32_end_pad_mask = + _mm256_loadu_si256((__m256i *)wd32_right_padding_mask); + const __m256i wd8_end_pad_mask = + _mm256_loadu_si256((__m256i *)wd8_right_padding_mask); + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); + + // The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously + // to generate output corresponding to 2 rows. To streamline the core loop and + // eliminate the need for conditional checks, the remaining columns (16 or 8) + // are processed separately. + if (filtered_length % 32 == 0) { + for (int i = 0; i < height; i += 2) { + int filter_offset = 0; + int row_offset = 0; + for (int j = 0; j < filtered_length; j += 32) { + PROCESS_RESIZE_X_WD32 + } + } + } else { + for (int i = 0; i < height; i += 2) { + int filter_offset = 0; + int remain_col = filtered_length; + int row_offset = 0; + // To avoid pixel over-read at frame boundary, processing of 32 pixels + // is done using the core loop only if sufficient number of pixels + // required for the load are present. The remaining pixels are processed + // separately. 
+ for (int j = 0; j <= filtered_length - 32; j += 32) { + if (remain_col == 34 || remain_col == 36) { + break; + } + PROCESS_RESIZE_X_WD32 + remain_col -= 32; + } + + int wd_processed = filtered_length - remain_col; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed separately. + if (remain_col > 15 && remain_col != 18 && remain_col != 20) { + remain_col = filtered_length - wd_processed - 16; + const int in_idx = i * in_stride + wd_processed; + const int out_idx = (i * dst_stride) + wd_processed / 2; + // a0 a1 --- a15 + __m128i row0 = + _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]); + // b0 b1 --- b15 + __m128i row1 = _mm_loadu_si128( + (__m128i *)&input[in_idx + in_stride - filter_offset]); + // a0 a1 --- a15 || b0 b1 --- b15 + __m256i r0 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + if (filter_offset == 0) { + r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); + } + filter_offset = 3; + const int is_last_cols16 = wd_processed + 16 == filtered_length; + if (is_last_cols16) row_offset = ROW_OFFSET; + + // a16 a17 --- a23 + row0 = _mm_loadl_epi64( + (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]); + // b16 b17 --- b23 + row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride - + row_offset - filter_offset]); + + // a16-a23 x x x x| b16-b23 x x x x + __m256i r1 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + + // Pad end pixels to the right, while processing the last pixels in the + // row. 
+ if (is_last_cols16) { + r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET), + wd32_end_pad_mask); + } + + // a0 a1 --- a15 || b0 b1 --- b15 + s0[0] = r0; + // a2 a3 --- a17 || b2 b3 --- b17 + s0[1] = _mm256_alignr_epi8(r1, r0, 2); + // a4 a5 --- a19 || b4 b5 --- b19 + s0[2] = _mm256_alignr_epi8(r1, r0, 4); + // a6 a7 --- a21 || b6 b7 --- b21 + s0[3] = _mm256_alignr_epi8(r1, r0, 6); + + // result for 16 pixels (a0 to a15) of row0 and row1 + __m256i res_out_0[2]; + res_out_0[0] = res_out_0[1] = zero; + resize_convolve(s0, coeffs_x, res_out_0); + + // r00-r07 + res_out_0[0] = _mm256_sra_epi32( + _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); + // r10-r17 + res_out_0[1] = _mm256_sra_epi32( + _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); + // r00-r03 r10-r13 r04-r07 r14-r17 + __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]); + // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17 + res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); + res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); + res_out_row01 = _mm256_max_epu8(res_out_row01, zero); + // r00-r03 r10-r13 r04-r07 r14-r17 + __m128i low_result = + CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8)); + // r00-r03 r04-r07 r10-r13 r14-r17 + low_result = _mm_shuffle_epi32(low_result, 0xd8); + + _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result); + _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride], + _mm_unpackhi_epi64(low_result, low_result)); + } + + // To avoid pixel over-read at frame boundary, processing of 8 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed by C function. 
+ wd_processed = filtered_length - remain_col; + if (remain_col > 7 && remain_col != 10 && remain_col != 12) { + remain_col = filtered_length - wd_processed - 8; + const int in_idx = i * in_stride + wd_processed - filter_offset; + const int out_idx = (i * dst_stride) + wd_processed / 2; + const int is_last_cols_8 = wd_processed + 8 == filtered_length; + if (is_last_cols_8) row_offset = ROW_OFFSET; + // a0 a1 --- a15 + __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]); + // b0 b1 --- b15 + __m128i row1 = + _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]); + // a0 a1 --- a15 || b0 b1 --- b15 + __m256i r0 = + _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + + // Pad end pixels to the right, while processing the last pixels in the + // row. + if (is_last_cols_8) + r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET), + wd8_end_pad_mask); + + // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7 + s0[0] = r0; + // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9 + s0[1] = _mm256_bsrli_epi128(r0, 2); + // a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10 + s0[2] = _mm256_bsrli_epi128(r0, 4); + // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10 + s0[3] = _mm256_bsrli_epi128(r0, 6); + + __m256i res_out_0[2]; + res_out_0[0] = res_out_0[1] = zero; + resize_convolve(s0, coeffs_x, res_out_0); + + // r00 - r03 | r10 - r13 + __m256i res_out = + _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20); + // r00 - r03 | r10 - r13 + res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits), + round_shift_bits); + // r00-r03 r00-r03 r10-r13 r10-r13 + __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out); + // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13 + res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); + res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); + res_out_row01 = _mm256_max_epu8(res_out_row01, zero); + + xx_storel_32(intbuf + 
out_idx, CAST_LOW(res_out_row01)); + xx_storel_32(intbuf + out_idx + dst_stride, + _mm256_extracti128_si256(res_out_row01, 1)); + } + + wd_processed = filtered_length - remain_col; + if (remain_col) { + const int in_idx = (in_stride * i); + const int out_idx = (wd_processed / 2) + width2 * i; + + down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, + wd_processed); + down2_symeven(input + in_idx + in_stride, filtered_length, + intbuf + out_idx + width2, wd_processed); + } + } + } +}
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c new file mode 100644 index 0000000..e2d84da --- /dev/null +++ b/av1/common/x86/resize_sse2.c
@@ -0,0 +1,342 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define ROW_OFFSET 5 + +#define PROCESS_RESIZE_Y_WD8 \ + /* ah0 ah1 ... ah7 */ \ + const __m128i AH = _mm_add_epi16(l0, l7); \ + /* bg0 bg1 ... bg7 */ \ + const __m128i BG = _mm_add_epi16(l1, l6); \ + /* cf0 cf1 ... cf7 */ \ + const __m128i CF = _mm_add_epi16(l2, l5); \ + /* de0 de1 ... de7 */ \ + const __m128i DE = _mm_add_epi16(l3, l4); \ + \ + /* ah0 bg0 ... ah3 bg3 */ \ + const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \ + /* cf0 de0 ... cf3 de3 */ \ + const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \ + \ + /* ah4 bg4... ah7 bg7 */ \ + const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \ + /* cf4 de4... cf7 de7 */ \ + const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \ + \ + /* r00 r01 r02 r03 */ \ + const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \ + const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \ + __m128i r0 = _mm_add_epi32(r00, r01); \ + /* r04 r05 r06 r07 */ \ + const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \ + const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \ + __m128i r1 = _mm_add_epi32(r10, r11); \ + \ + r0 = _mm_add_epi32(r0, round_const_bits); \ + r1 = _mm_add_epi32(r1, round_const_bits); \ + r0 = _mm_sra_epi32(r0, round_shift_bits); \ + r1 = _mm_sra_epi32(r1, round_shift_bits); \ + \ + /* r00 ... 
r07 (8 values of each 16bit) */ \ + const __m128i res_16b = _mm_packs_epi32(r0, r1); \ + /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \ + const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \ + \ + __m128i res = _mm_min_epu8(res_8b0, clip_pixel); \ + res = _mm_max_epu8(res, zero); \ + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \ + \ + l0 = l2; \ + l1 = l3; \ + l2 = l4; \ + l3 = l5; \ + l4 = l6; \ + l5 = l7; \ + data += 2 * stride; + +static INLINE void prepare_filter_coeffs(const int16_t *filter, + __m128i *const coeffs /* [2] */) { + // f0 f1 f2 f3 x x x x + const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); + + // f1 f0 f3 f2 x x x x + const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1); + + // f3 f2 f3 f2 ... + coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55); + // f1 f0 f1 f0 ... + coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00); +} + +bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, + int start_col) { + // For the GM tool, the input layer height or width is assured to be an even + // number. Hence the function 'down2_symodd()' is not invoked and SIMD + // optimization of the same is not implemented. + // When the input height is less than 8 and even, the potential input + // heights are limited to 2, 4, or 6. These scenarios require separate + // handling due to padding requirements. Invoking the C function here will + // eliminate the need for conditional statements within the subsequent SIMD + // code to manage these cases. 
+ if (height & 1 || height < 8) { + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m128i coeffs_y[2]; + const int bits = FILTER_BITS; + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const uint8_t max_pixel = 255; + const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); + const __m128i zero = _mm_setzero_si128(); + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int remain_col = stride % 8; + + for (int j = start_col; j < stride - remain_col; j += 8) { + uint8_t *data = &intbuf[j]; + // d0 ... d7 + const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. + // a0 ... a7 + const __m128i l8_0 = l8_3; + // b0 ... b7 + const __m128i l8_1 = l8_3; + // c0 ... c7 + const __m128i l8_2 = l8_3; + // e0 ... e7 + const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + // f0 ... f7 + const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + + // Convert to 16bit as addition of 2 source pixel crosses 8 bit. + __m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit) + __m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit) + __m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit) + __m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit) + __m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit) + __m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit) + + // Increment the pointer such that the loading starts from row G. + data = data + 3 * stride; + // The core vertical SIMD processes 2 input rows simultaneously to generate + // output corresponding to 1 row. To streamline the core loop and eliminate + // the need for conditional checks, the remaining 4 rows are processed + // separately. 
+ for (int i = 0; i < height - 4; i += 2) { + // g0 ... g7 + __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); + // h0 ... h7 + __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride)); + __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b) + __m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b) + + PROCESS_RESIZE_Y_WD8 + } + + __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); + __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); + // Process the last 4 input rows here. + for (int i = height - 4; i < height; i += 2) { + __m128i l7 = l6; + PROCESS_RESIZE_Y_WD8 + } + } + + if (remain_col) + return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); + + return true; +} + +// Blends a and b using mask and returns the result: bits of b are taken where the mask is set, bits of a elsewhere. +static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) { + const __m128i masked_b = _mm_and_si128(mask, b); + const __m128i masked_a = _mm_andnot_si128(mask, a); + return (_mm_or_si128(masked_a, masked_b)); +} + +// Masks used for width 16 pixels, with left and right padding +// requirements. +static const uint8_t left_padding_mask[16] = { + 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 255, 255, + 255, 255, 255, 255 }; + +static const uint8_t mask_16[16] = { + 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, +}; + +void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, int filtered_length, + int width2) { + assert(height % 2 == 0); + // Invoke C for width less than 16. 
+ if (filtered_length < 16) { + av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, + width2); + return; + } + + __m128i coeffs_x[2]; + const int bits = FILTER_BITS; + const int dst_stride = width2; + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const uint8_t max_pixel = 255; + const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); + const __m128i zero = _mm_setzero_si128(); + + const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask); + const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask); + const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16); + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); + + for (int i = 0; i < height; ++i) { + int filter_offset = 0; + int row01_offset = ROW_OFFSET; + int remain_col = filtered_length; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done using the core loop only if sufficient number of pixels required + // for the load are present. The remaining pixels are processed separately. + for (int j = 0; j <= filtered_length - 16; j += 16) { + if (remain_col == 18 || remain_col == 20) { + break; + } + const int is_last_cols16 = (j == filtered_length - 16); + // While processing the last 16 pixels of the row, ensure that only valid + // pixels are loaded. + if (is_last_cols16) row01_offset = 0; + const int in_idx = i * in_stride + j - filter_offset; + const int out_idx = i * dst_stride + j / 2; + remain_col -= 16; + // a0 a1 a2 a3 .... a15 + __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); + // a8 a9 a10 a11 .... a23 + __m128i row01 = _mm_loadu_si128( + (__m128i *)&input[in_idx + row01_offset + filter_offset]); + filter_offset = 3; + + // Pad start pixels to the left, while processing the first pixels in the + // row. 
+ if (j == 0) { + const __m128i start_pixel_row0 = + _mm_set1_epi8((char)input[i * in_stride]); + row00 = + blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask); + } + + // Pad end pixels to the right, while processing the last pixels in the + // row. + if (is_last_cols16) { + const __m128i end_pixel_row0 = + _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]); + row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0, + end_pad_mask); + } + + // a2 a3 a4 a5 a6 a7 a8 a9 .... a17 + const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2), + _mm_srli_si128(row01, 2)); + // a4 a5 a6 a7 a8 a9 a10 a11 .... a19 + const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4), + _mm_srli_si128(row01, 4)); + // a6 a7 a8 a9 a10 a11 a12 a13 .... a21 + const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6), + _mm_srli_si128(row01, 6)); + + // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit) + const __m128i s0 = _mm_and_si128(row00, mask_even); + // a1 a3 a5 a7 a9 a11 a13 a15 + const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even); + // a2 a4 a6 a8 a10 a12 a14 a16 + const __m128i s2 = _mm_and_si128(row0_1, mask_even); + // a3 a5 a7 a9 a11 a13 a15 a17 + const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even); + // a4 a6 a8 a10 a12 a14 a16 a18 + const __m128i s4 = _mm_and_si128(row0_2, mask_even); + // a5 a7 a9 a11 a13 a15 a17 a19 + const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even); + // a6 a8 a10 a12 a14 a16 a18 a20 + const __m128i s6 = _mm_and_si128(row0_3, mask_even); + // a7 a9 a11 a13 a15 a17 a19 a21 + const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even); + + // a0a7 a2a9 a4a11 .... a12a19 a14a21 + const __m128i s07 = _mm_add_epi16(s0, s7); + // a1a6 a3a8 a5a10 .... a13a18 a15a20 + const __m128i s16 = _mm_add_epi16(s1, s6); + // a2a5 a4a7 a6a9 .... a14a17 a16a19 + const __m128i s25 = _mm_add_epi16(s2, s5); + // a3a4 a5a6 a7a8 .... 
a15a16 a17a18 + const __m128i s34 = _mm_add_epi16(s3, s4); + + // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12 + const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16); + // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10 + const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34); + + // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20 + const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16); + // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18 + const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34); + + const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]); + const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]); + const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]); + const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]); + + // Result of first 8 pixels of row0 (a0 to a7). + // r0_0 r0_1 r0_2 r0_3 + __m128i r00 = _mm_add_epi32(r01_0, r01_1); + r00 = _mm_add_epi32(r00, round_const_bits); + r00 = _mm_sra_epi32(r00, round_shift_bits); + + // Result of next 8 pixels of row0 (a8 to a15). + // r0_4 r0_5 r0_6 r0_7 + __m128i r01 = _mm_add_epi32(r01_2, r01_3); + r01 = _mm_add_epi32(r01, round_const_bits); + r01 = _mm_sra_epi32(r01, round_shift_bits); + + // r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7 + const __m128i res_16 = _mm_packs_epi32(r00, r01); + const __m128i res_8 = _mm_packus_epi16(res_16, res_16); + __m128i res = _mm_min_epu8(res_8, clip_pixel); + res = _mm_max_epu8(res, zero); + + // r0_0 r0_1 r0_2 r0_3 r0_4 r0_5 r0_6 r0_7 + _mm_storel_epi64((__m128i *)&intbuf[out_idx], res); + } + + int wd_processed = filtered_length - remain_col; + if (remain_col) { + const int in_idx = (in_stride * i); + const int out_idx = (wd_processed / 2) + width2 * i; + + down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, + wd_processed); + } + } +}
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index e3cce40..d256512 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c
@@ -14,20 +14,23 @@ #include <stddef.h> #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" -#include "config/av1_rtcd.h" #include "aom/aom_codec.h" +#include "aom/aom_image.h" +#include "aom/internal/aom_codec_internal.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_reader.h" #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" +#include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" #include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG @@ -35,33 +38,41 @@ #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" #include "av1/common/cdef.h" #include "av1/common/cfl.h" -#if CONFIG_INSPECTION -#include "av1/decoder/inspection.h" -#endif +#include "av1/common/common_data.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/enums.h" #include "av1/common/frame_buffers.h" #include "av1/common/idct.h" +#include "av1/common/mv.h" #include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/resize.h" +#include "av1/common/restoration.h" +#include "av1/common/scale.h" #include "av1/common/seg_common.h" #include "av1/common/thread_common.h" #include "av1/common/tile_common.h" #include "av1/common/warped_motion.h" -#include "av1/common/obmc.h" + #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodetxb.h" 
#include "av1/decoder/detokenize.h" +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif #define ACCT_STR __func__ @@ -1935,8 +1946,8 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, - &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, - 0)) { + &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, + false, 0)) { unlock_buffer_pool(pool); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -2230,6 +2241,12 @@ if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { // The remaining bits in the top byte signal the row offset int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; + if (offset > row) { + aom_internal_error( + error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid row offset in tile copy mode: row=%d offset=%d", row, + offset); + } // Currently, only use tiles in same column as reference tiles. copy_data = tile_buffers[row - offset][col].data; @@ -2293,7 +2310,11 @@ const int tile_col_size_bytes = pbi->tile_col_size_bytes; const int tile_size_bytes = pbi->tile_size_bytes; int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + aom_internal_error( + &pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Not all the tiles in the tile list have the same size."); + } const int tile_copy_mode = ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 
1 : 0; // Read tile column sizes for all columns (we need the last tile buffer) @@ -2302,8 +2323,16 @@ size_t tile_col_size; if (!is_last) { + if (tile_col_size_bytes > data_end - data) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Not enough data to read tile_col_size"); + } tile_col_size = mem_get_varsize(data, tile_col_size_bytes); data += tile_col_size_bytes; + if (tile_col_size > (size_t)(data_end - data)) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "tile_col_data_end[%d] is out of bound", c); + } tile_col_data_end[c] = data + tile_col_size; } else { tile_col_size = data_end - data; @@ -2440,6 +2469,7 @@ const int n_tiles) { AV1_COMMON *const cm = &pbi->common; aom_free(pbi->tile_data); + pbi->allocated_tiles = 0; CHECK_MEM_ERROR(cm, pbi->tile_data, aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); pbi->allocated_tiles = n_tiles; @@ -3180,18 +3210,16 @@ pthread_mutex_lock(pbi->row_mt_mutex_); #endif frame_row_mt_info->row_mt_exit = 1; - +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif // If any SB row (erroneous row) processed by a thread encounters an // internal error, there is a need to indicate other threads that decoding // of the erroneous row is complete. This ensures that other threads which // wait upon the completion of SB's present in erroneous row are not waiting // indefinitely. 
signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd); - -#if CONFIG_MULTITHREAD - pthread_cond_broadcast(pbi->row_mt_cond_); - pthread_mutex_unlock(pbi->row_mt_mutex_); -#endif return 0; } thread_data->error_info.setjmp = 1; @@ -3872,8 +3900,8 @@ #endif } -void av1_read_film_grain_params(AV1_COMMON *cm, - struct aom_read_bit_buffer *rb) { +static void read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; const SequenceHeader *const seq_params = cm->seq_params; @@ -4041,7 +4069,7 @@ struct aom_read_bit_buffer *rb) { if (cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { - av1_read_film_grain_params(cm, rb); + read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } @@ -4769,7 +4797,7 @@ seq_params->max_frame_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, features->byte_alignment, - &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false, 0)) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool);
diff --git a/av1/decoder/decodemv.h b/av1/decoder/decodemv.h index 3d8629c..7e77c03 100644 --- a/av1/decoder/decodemv.h +++ b/av1/decoder/decodemv.h
@@ -20,6 +20,8 @@ extern "C" { #endif +int av1_neg_deinterleave(int diff, int ref, int max); + void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, aom_reader *r, int x_mis, int y_mis);
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c index 33554a6..a886ed4 100644 --- a/av1/decoder/decoder.c +++ b/av1/decoder/decoder.c
@@ -21,6 +21,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_scale/aom_scale.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" @@ -79,9 +80,10 @@ static void dec_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; - mi_params->mi_alloc_size = 0; + mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; }
diff --git a/av1/decoder/dthread.h b/av1/decoder/dthread.h index f82b9d8..b0f6fda 100644 --- a/av1/decoder/dthread.h +++ b/av1/decoder/dthread.h
@@ -14,7 +14,6 @@ #include "config/aom_config.h" -#include "aom_util/aom_thread.h" #include "aom/internal/aom_codec_internal.h" #ifdef __cplusplus
diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c index b687cf9..e0b2d87 100644 --- a/av1/decoder/obu.c +++ b/av1/decoder/obu.c
@@ -76,7 +76,7 @@ return 0; } -static uint32_t read_temporal_delimiter_obu() { return 0; } +static uint32_t read_temporal_delimiter_obu(void) { return 0; } // Returns a boolean that indicates success. static int read_bitstream_level(AV1_LEVEL *seq_level_idx, @@ -367,16 +367,13 @@ return header_size + tg_payload_size; } -static void alloc_tile_list_buffer(AV1Decoder *pbi) { +static void alloc_tile_list_buffer(AV1Decoder *pbi, int tile_width_in_pixels, + int tile_height_in_pixels) { // The resolution of the output frame is read out from the bitstream. The data // are stored in the order of Y plane, U plane and V plane. As an example, for // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the // output frame. AV1_COMMON *const cm = &pbi->common; - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_width_in_pixels = tile_width * MI_SIZE; - const int tile_height_in_pixels = tile_height * MI_SIZE; const int output_frame_width = (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; const int output_frame_height = @@ -396,7 +393,7 @@ cm->seq_params->subsampling_y, (cm->seq_params->use_highbitdepth && (cm->seq_params->bit_depth > AOM_BITS_8)), - 0, cm->features.byte_alignment, 0, 0)) + 0, cm->features.byte_alignment, false, 0)) aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate the tile list output buffer"); } @@ -424,13 +421,10 @@ return; } -static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, - int tile_idx) { +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int tile_idx, + int tile_width_in_pixels, + int tile_height_in_pixels) { AV1_COMMON *const cm = &pbi->common; - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_width_in_pixels = tile_width * MI_SIZE; - const int tile_height_in_pixels = tile_height * MI_SIZE; const int ssy = cm->seq_params->subsampling_y; const int 
ssx = cm->seq_params->subsampling_x; const int num_planes = av1_num_planes(cm); @@ -501,13 +495,31 @@ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + + // The output frame is used to store the decoded tile list. The decoded tile + // list has to fit into 1 output frame. + if ((pbi->tile_count_minus_1 + 1) > + (pbi->output_frame_width_in_tiles_minus_1 + 1) * + (pbi->output_frame_height_in_tiles_minus_1 + 1)) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + // Allocate output frame buffer for the tile list. - alloc_tile_list_buffer(pbi); + alloc_tile_list_buffer(pbi, tile_width_in_pixels, tile_height_in_pixels); uint32_t tile_list_info_bytes = 4; tile_list_payload_size += tile_list_info_bytes; @@ -558,7 +570,8 @@ assert(data <= data_end); // Copy the decoded tile to the tile list output buffer. - copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx); + copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx, tile_width_in_pixels, + tile_height_in_pixels); tile_idx++; }
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c index a59d0d7..87becb8 100644 --- a/av1/encoder/allintra_vis.c +++ b/av1/encoder/allintra_vis.c
@@ -13,6 +13,8 @@ #include "config/aom_config.h" +#include "aom_util/aom_pthread.h" + #if CONFIG_TFLITE #include "tensorflow/lite/c/c_api.h" #include "av1/encoder/deltaq4_model.c" @@ -270,13 +272,14 @@ const int coeff_count = block_size * block_size; const int mb_step = mi_size_wide[bsize]; const BitDepthInfo bd_info = get_bit_depth_info(xd); - const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; + const MultiThreadInfo *const mt_info = &cpi->mt_info; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; const int mi_cols = cm->mi_params.mi_cols; const int mt_thread_id = mi_row / mb_step; // TODO(chengchen): test different unit step size - const int mt_unit_step = mi_size_wide[BLOCK_64X64]; + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step; int mt_unit_col = 0; const int is_high_bitdepth = is_cur_buf_hbd(xd); @@ -293,6 +296,18 @@ if (mi_col % mt_unit_step == 0) { intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id, mt_unit_col); +#if CONFIG_MULTITHREAD + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + if (num_workers > 1) { + const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + pthread_mutex_lock(enc_row_mt->mutex_); + const bool exit = enc_row_mt->mb_wiener_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Stop further processing in case any worker has encountered an error. 
+ if (exit) break; + } +#endif } PREDICTION_MODE best_mode = DC_PRED; @@ -575,7 +590,7 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0)) + NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h index ab39968..0d34ce0 100644 --- a/av1/encoder/allintra_vis.h +++ b/av1/encoder/allintra_vis.h
@@ -20,6 +20,8 @@ #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" +#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64 + void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi); void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c index f48ff11..2ef6cba 100644 --- a/av1/encoder/aq_cyclicrefresh.c +++ b/av1/encoder/aq_cyclicrefresh.c
@@ -15,6 +15,7 @@ #include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" @@ -102,15 +103,15 @@ weight_segment2 = 0; } // Take segment weighted average for estimated bits. - const int estimated_bits = - (int)((1.0 - weight_segment1 - weight_segment2) * - av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) + - weight_segment1 * - av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1], - correction_factor) + - weight_segment2 * - av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2], - correction_factor)); + const int estimated_bits = (int)round( + (1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) + + weight_segment1 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1], + correction_factor) + + weight_segment2 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2], + correction_factor)); return estimated_bits; } @@ -138,13 +139,13 @@ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate; // Take segment weighted average for bits per mb. 
- bits_per_mb = - (int)((1.0 - weight_segment) * - av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i, - correction_factor, accurate_estimate) + - weight_segment * av1_rc_bits_per_mb( - cpi, cm->current_frame.frame_type, i + deltaq, - correction_factor, accurate_estimate)); + bits_per_mb = (int)round( + (1.0 - weight_segment) * + av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i, + correction_factor, accurate_estimate) + + weight_segment * av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, + i + deltaq, correction_factor, + accurate_estimate)); return bits_per_mb; } @@ -295,6 +296,7 @@ const CommonModeInfoParams *const mi_params = &cm->mi_params; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->enc_seg.map; + unsigned char *const active_map_4x4 = cpi->active_map.map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; uint64_t sb_sad = 0; @@ -302,7 +304,12 @@ uint64_t thresh_sad = INT64_MAX; const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols; const int mi_stride = mi_cols; - memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + // Don't set seg_map to 0 if active_maps is enabled. Active_maps will set + // seg_map to either 7 or 0 (AM_SEGMENT_ID_INACTIVE/ACTIVE), and cyclic + // refresh set below (segment 1 or 2) will only be set for ACTIVE blocks. + if (!cpi->active_map.enabled) { + memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + } sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; @@ -357,7 +364,10 @@ // for possible boost/refresh (segment 1). The segment id may get // reset to 0 later if block gets coded anything other than low motion. // If the block_sad (sb_sad) is very low label it for refresh anyway. 
- if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) { + // If active_maps is enabled, only allow for setting on ACTIVE blocks. + if ((cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) && + (!cpi->active_map.enabled || + active_map_4x4[bl_index2] == AM_SEGMENT_ID_ACTIVE)) { sum_map += 4; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; @@ -380,7 +390,8 @@ cr->sb_index = i; if (cr->target_num_seg_blocks == 0) { // Disable segmentation, seg_map is already set to 0 above. - av1_disable_segmentation(&cm->seg); + // Don't disable if active_map is being used. + if (!cpi->active_map.enabled) av1_disable_segmentation(&cm->seg); } } @@ -423,8 +434,6 @@ // function av1_cyclic_reset_segment_skip(). Skipping over // 4x4 will therefore have small bdrate loss (~0.2%), so // we use it only for speed > 9 for now. - // Also if loop-filter deltas is applied via segment, then - // we need to set cr->skip_over4x4 = 1. cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0; // should we enable cyclic refresh on this frame. @@ -450,6 +459,15 @@ else cr->percent_refresh = 10 + cr->percent_refresh_adjustment; + if (cpi->active_map.enabled) { + // Scale down the percent_refresh to target the active blocks only. + cr->percent_refresh = + cr->percent_refresh * (100 - cpi->rc.percent_blocks_inactive) / 100; + if (cr->percent_refresh == 0) { + cr->apply_cyclic_refresh = 0; + } + } + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->use_block_sad_scene_det = @@ -543,10 +561,14 @@ if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); if (!cr->apply_cyclic_refresh) { - // Set segmentation map to 0 and disable. - unsigned char *const seg_map = cpi->enc_seg.map; - memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); - av1_disable_segmentation(&cm->seg); + // Don't disable and set seg_map to 0 if active_maps is enabled, unless + // whole frame is set as inactive (since we only apply cyclic_refresh to + // active blocks). 
+ if (!cpi->active_map.enabled || cpi->rc.percent_blocks_inactive == 100) { + unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + } if (frame_is_intra_only(cm) || scene_change_detected || cpi->ppi->rtc_ref.bias_recovery_frame) { cr->sb_index = 0; @@ -574,9 +596,11 @@ cr->thresh_rate_sb = INT64_MAX; } // Set up segmentation. - // Clear down the segment map. av1_enable_segmentation(&cm->seg); - av1_clearall_segfeatures(seg); + if (!cpi->active_map.enabled) { + // Clear down the segment map, only if active_maps is not enabled. + av1_clearall_segfeatures(seg); + } // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in @@ -644,6 +668,10 @@ int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int qindex = cpi->common.quant_params.base_qindex; + if (cpi->active_map.enabled && + cpi->rc.percent_blocks_inactive > + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef) + return 1; if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 && cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh && cpi->rc.frame_source_sad < 1000 &&
diff --git a/av1/encoder/arm/av1_error_neon.c b/av1/encoder/arm/av1_error_neon.c new file mode 100644 index 0000000..1d4299f --- /dev/null +++ b/av1/encoder/arm/av1_error_neon.c
@@ -0,0 +1,96 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can only accumulate 2 squares. 
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_s64x2(ssz_s64); + return (int64_t)horizontal_add_u64x2(err_u64); +} + +int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + uint64x2_t err_u64 = vdupq_n_u64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return (int64_t)horizontal_add_u64x2(err_u64); +}
diff --git a/av1/encoder/arm/av1_error_sve.c b/av1/encoder/arm/av1_error_sve.c new file mode 100644 index 0000000..5a1ad2f --- /dev/null +++ b/av1/encoder/arm/av1_error_sve.c
@@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" + +int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0); + sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1])); + return vaddvq_s64(vaddq_s64(error[0], error[1])); +} + +int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + if (block_size % 32 == 0) { + int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = 
vld1q_s16(coeff + 8); + const int16x8_t c2 = vld1q_s16(coeff + 16); + const int16x8_t c3 = vld1q_s16(coeff + 24); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + const int16x8_t d2 = vld1q_s16(dqcoeff + 16); + const int16x8_t d3 = vld1q_s16(dqcoeff + 24); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + const int16x8_t diff2 = vsubq_s16(c2, d2); + const int16x8_t diff3 = vsubq_s16(c3, d3); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + error[2] = aom_sdotq_s16(error[2], diff2, diff2); + error[3] = aom_sdotq_s16(error[3], diff3, diff3); + + coeff += 32; + dqcoeff += 32; + block_size -= 32; + } while (block_size != 0); + + error[0] = vaddq_s64(error[0], error[1]); + error[2] = vaddq_s64(error[2], error[3]); + error[0] = vaddq_s64(error[0], error[2]); + return vaddvq_s64(error[0]); + } + assert(block_size == 16); + + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return vaddvq_s64(vaddq_s64(error[0], error[1])); +}
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/av1_fwd_txfm2d_neon.c similarity index 94% rename from av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c rename to av1/encoder/arm/av1_fwd_txfm2d_neon.c index a17a41a..5148ee7 100644 --- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c +++ b/av1/encoder/arm/av1_fwd_txfm2d_neon.c
@@ -1598,44 +1598,6 @@ int32_t *output, int stride, int cos_bit); -static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = { - fdct4x4_col_neon, // DCT_DCT - fadst4x4_col_neon, // ADST_DCT - fdct4x4_col_neon, // DCT_ADST - fadst4x4_col_neon, // ADST_ADST - fadst4x4_col_neon, // FLIPADST_DCT - fdct4x4_col_neon, // DCT_FLIPADST - fadst4x4_col_neon, // FLIPADST_FLIPADST - fadst4x4_col_neon, // ADST_FLIPADST - fadst4x4_col_neon, // FLIPADST_ADST - fidentity4x4_col_neon, // IDTX - fdct4x4_col_neon, // V_DCT - fidentity4x4_col_neon, // H_DCT - fadst4x4_col_neon, // V_ADST - fidentity4x4_col_neon, // H_ADST - fadst4x4_col_neon, // V_FLIPADST - fidentity4x4_col_neon // H_FLIPADST -}; - -static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = { - fdct4x4_row_neon, // DCT_DCT - fdct4x4_row_neon, // ADST_DCT - fadst4x4_row_neon, // DCT_ADST - fadst4x4_row_neon, // ADST_ADST - fdct4x4_row_neon, // FLIPADST_DCT - fadst4x4_row_neon, // DCT_FLIPADST - fadst4x4_row_neon, // FLIPADST_FLIPADST - fadst4x4_row_neon, // ADST_FLIPADST - fadst4x4_row_neon, // FLIPADST_ADST - fidentity4x4_row_neon, // IDTX - fidentity4x4_row_neon, // V_DCT - fdct4x4_row_neon, // H_DCT - fidentity4x4_row_neon, // V_ADST - fadst4x4_row_neon, // H_ADST - fidentity4x4_row_neon, // V_FLIPADST - fadst4x4_row_neon // H_FLIPADST -}; - static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = { fdct4x8_col_neon, // DCT_DCT fadst4x8_col_neon, // ADST_DCT @@ -1943,21 +1905,96 @@ static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; - int16x4_t buf0[4], buf1[4]; - const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type]; - const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type]; int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); - col_txfm(input, buf0, stride, 13); - transpose_arrays_s16_4x4(buf0, buf1); - if 
(lr_flip) { - flip_buf_4_neon(buf1, buf0, 4); - row_txfm(buf0, output, 4, 13); - } else { - row_txfm(buf1, output, 4, 13); + int16x4_t buf0[4], buf1[4]; + switch (tx_type) { + case DCT_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_ADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case FLIPADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_FLIPADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case ADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case IDTX: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case V_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + 
case H_DCT: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case V_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_ADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case V_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_FLIPADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; } } @@ -2040,22 +2077,113 @@ static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; - int16x8_t buf0[8], buf1[8]; - const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; - const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); - col_txfm(input, buf0, stride, 13); - shift_right_1_round_s16_x8(buf0, buf0, 8); - transpose_arrays_s16_8x8(buf0, buf1); - if (lr_flip) { - flip_buf_8_neon(buf1, buf0, 8); - row_txfm(buf0, output, 8, 13); - } else { - row_txfm(buf1, output, 8, 13); + int16x8_t buf0[8], buf1[8]; + + switch (tx_type) { + case DCT_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_ADST: + 
fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case FLIPADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_FLIPADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case ADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case IDTX: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case V_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_DCT: + fidentity8x8_col_neon(input, buf0, stride, 13); + 
shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case V_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_ADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case V_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_FLIPADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; } } @@ -2376,8 +2504,8 @@ } } -static void fdct32_new_neon(const int32x4_t *input, int32x4_t *output, - int cos_bit) { +static void fdct32_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); @@ -2598,8 +2726,8 @@ output[31] = buf0[31]; } -static void fdct64_new_neon(const int32x4_t *input, int32x4_t *output, - int cos_bit) { +static void fdct64_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); @@ -2853,8 +2981,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct64_new_neon(bufA, bufA, 10); - fdct64_new_neon(bufB, bufB, 10); + fdct64_neon(bufA, bufA, 10); + fdct64_neon(bufB, bufB, 10); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 
32, 32); @@ -2883,8 +3011,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct64_new_neon(bufA, bufA, 11); - fdct64_new_neon(bufB, bufB, 11); + fdct64_neon(bufA, bufA, 11); + fdct64_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); @@ -2918,8 +3046,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct32_new_neon(bufA, bufA, 11); - fdct32_new_neon(bufB, bufB, 11); + fdct32_neon(bufA, bufA, 11); + fdct32_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
diff --git a/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_highbd_quantize_neon.c rename to av1/encoder/arm/av1_highbd_quantize_neon.c
diff --git a/av1/encoder/arm/neon/av1_k_means_neon.c b/av1/encoder/arm/av1_k_means_neon.c similarity index 98% rename from av1/encoder/arm/neon/av1_k_means_neon.c rename to av1/encoder/arm/av1_k_means_neon.c index d13cc65..5863769 100644 --- a/av1/encoder/arm/neon/av1_k_means_neon.c +++ b/av1/encoder/arm/av1_k_means_neon.c
@@ -12,7 +12,7 @@ #include "aom_dsp/arm/sum_neon.h" #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" static int32x4_t k_means_multiply_add_neon(const int16x8_t a) { const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));
diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/av1_temporal_denoiser_neon.c similarity index 100% rename from av1/encoder/arm/neon/av1_temporal_denoiser_neon.c rename to av1/encoder/arm/av1_temporal_denoiser_neon.c
diff --git a/av1/encoder/arm/cnn_neon.c b/av1/encoder/arm/cnn_neon.c new file mode 100644 index 0000000..8e68626 --- /dev/null +++ b/av1/encoder/arm/cnn_neon.c
@@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <math.h> +#include <stdbool.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_cnn_weights.h" + +// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are +// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in +// partition_cnn_weights.h. However, to enable linear memory access, rearrange +// the weight tables here. 
+static const float weights_layer_1[] = { + 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f, + -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f, + 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f, + -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f, + 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f, + 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f, + 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f, + -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f, + -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f, + 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f, + -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f, + 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f, + 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f, + -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f, + 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f, + -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f, + -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f, + -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f, + -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f, + -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f, + 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f, + 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f, + -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f, + -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f, + -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f, + 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f, + 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f, + 0.395561f, -0.398509f, 
0.450017f, -1.462710f, 1.138280f, -0.447774f, + 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f, + 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f, + 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f, + 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f, + 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f, + 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f, + 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f, + 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f, + 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f, + 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f, + 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f, + 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f, + 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f, + 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f, + -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f, + 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f, + 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f, + 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f, + 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f, + 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f, + 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f, + -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f, + 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f, + -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f, + -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f, + 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f, + 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f, + -0.481148f, -0.182257f, 0.918796f, 
0.213872f, 1.050410f, 0.681526f, + -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f, + -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f, + 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f, + -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f, + 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f, + -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f, + 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f, + -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f, + 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f, + -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f, + 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f, + -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f, + -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f, + 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f, + 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f, + 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f, + 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f, + -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f, + 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f, + 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f, + 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f, + -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f, + 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f, + 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f, + 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f, + 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f, + 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f, + 0.264794f, -0.085791f, 0.156996f, 0.200340f, 
0.169264f, 0.267615f, + -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f, + 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f, + 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f, + 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f, + 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f, + -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f, + -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f, + 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f, + -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f, + -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f, + 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f, + 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f, + 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f, + 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f, + 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f, + -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f, + 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f, + -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f, + -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f, + -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f, + 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f, + 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f, + 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f, + -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f, + -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f, + -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f, + 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f, + 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f, + 
-0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f, + -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f, + 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f, + 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f, + 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f, + 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f, + 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f, + -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f, + 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f, + -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f, + 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f, + -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f, + -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f, + 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f, + -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f, + 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f, + -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f, + -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f, + -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f, + -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f, + -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f, + 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f, + -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f, + 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f, + -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f, + 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f, + -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f, + -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f, + -1.262200f, 
-0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f, + -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f, + -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f, + 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f, + -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f, + 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f, + -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f, + -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f, + 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f, + 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f, + -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f, + -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f, + 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f, + -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f, + -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f, + 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f, + -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f, + -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f, + -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f, + 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f, + -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f, + -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f, + 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f, + 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f, + -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f, + -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f, + -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f, + -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f, + -0.248375f, 0.242570f, 
-0.155856f, 0.280528f, -0.198742f, 0.588725f, + 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f, + -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f, + 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f, + -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f, + 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f, + -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f, + 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f, + -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f, + 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f, + 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f, + 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f, + 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f, + 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f, + 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f, + 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f, + 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f, + 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f, + 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f, + -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f, + 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f, + 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f, + -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f, + 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f, + 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f, + -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f, + -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f, + -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f, + -0.159634f, -0.076545f, -0.022496f, 
-0.036532f, -0.147028f, -0.034215f, + 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f, + -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f, + 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f, + 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f, + 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f, + 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f, + 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f, + -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f, + 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f, + -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f, + 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f, + 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f, + 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f, + 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f, + -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f, + -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f, + -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f, + -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f, + -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f, + 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f, + 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f, + -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f, + 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f, + -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f, + -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f, + 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f, + 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f, + -0.303960f, -0.078863f, 0.195410f, 
0.729438f, 0.246818f, 0.287730f, + 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f, + -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f, + 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f, + -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f, + 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f, + 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f, + 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f, + 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f, + 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f, + 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f, + 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f, + 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f, + -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f, + -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f, + -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f, + -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f, + 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f, + -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f, + -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f, + 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f, + -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f, + 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f, + 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f, + -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f, + 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f, + 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f, + -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f, + 0.351772f, -0.303383f, -0.311177f, 0.247653f, 
0.013035f, 0.034648f, + -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f, + 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f, + -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f, + 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f, + -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f, + -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f, + -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f, + -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f, + -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f, + 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f, + 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f, + -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f, + 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f, + 0.018088f, 0.115791f, -0.079165f, 0.139388f, +}; + +static const float weights_layer_2[] = { + 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f, + 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f, + 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f, + -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f, + -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f, + -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f, + 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f, + -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f, + 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f, + 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f, + 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f, + -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f, + 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f, + 0.084164f, 0.123630f, 0.072542f, 
0.096063f, 0.083236f, 0.087630f, + 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f, + -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f, + -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f, + 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f, + 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f, + 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f, + 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f, + 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f, + 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f, + -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f, + 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f, + 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f, + 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f, + 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f, + -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f, + 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f, + 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f, + -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f, + -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f, + 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f, + -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f, + -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f, + 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f, + 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f, + 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f, + -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f, + -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f, + 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 
0.053012f, + -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f, + 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f, + 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f, + -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f, + -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f, + -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f, + -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f, + -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f, + -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f, + -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f, + -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f, + -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f, + 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f, + 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f, + 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f, + -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f, + 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f, + -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f, + 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f, + -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f, + 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f, + -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f, + -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f, + -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f, + 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f, + -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f, + 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f, + -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f, + 
-0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f, + -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f, + -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f, + -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f, + -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f, + -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f, + 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f, + -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f, + -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f, + -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f, + 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f, + 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f, + 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f, + 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f, + 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f, + -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f, + 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f, + 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f, + 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f, + 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f, + 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f, + 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f, + 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f, + -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f, + -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f, + -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f, + 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f, + -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f, + 
0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f, + -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f, + -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f, + -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f, + 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f, + -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f, + -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f, + 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f, + -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f, + -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f, + 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f, + -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f, + -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f, + 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f, + -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f, + -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f, + -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f, + -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f, + 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f, + -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f, + 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f, + -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f, + -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f, + -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f, + -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f, + -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f, + 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f, + -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f, + 
-0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f, + 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f, + -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f, + -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f, + -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f, + -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f, + -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f, + 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f, + 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f, + -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f, + -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f, + -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f, + 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f, + -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f, + 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f, + -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f, + 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f, + -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f, + 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f, + 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f, + 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f, + 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f, + -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f, + 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f, + -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f, + -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f, + -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f, + 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f, + 
-0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f, + 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f, + -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f, + 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f, + 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f, + 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f, + 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f, + 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f, + 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f, + 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f, + -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f, + -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f, + 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f, + -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f, + 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f, + 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f, + 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f, + -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f, + 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f, + 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f, + -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f, + 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f, + 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f, + -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f, + -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f, + -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f, + 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f, + -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f, + 0.089161f, 0.355811f, 
-0.078245f, -0.148490f, -0.407301f, -1.296870f, + -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f, + 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f, + -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f, + -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f, + 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f, + 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f, + 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f, + 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f, + -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f, + 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f, + 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f, + -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f, + 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f, + -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f, + 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f, + -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f, + 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f, + 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f, + 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f, + 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f, + -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f, + 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f, + -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f, + 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f, + 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f, + 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f, + -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f, + 0.035750f, 0.018049f, -0.029033f, -0.067255f, 
0.192049f, 0.009664f, + -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f, + 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f, + -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f, + -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f, + 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f, + -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f, + -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f, + 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f, + -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f, + 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f, + 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f, + -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f, + 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f, + -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f, + 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f, + -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f, + 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f, + -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f, + 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f, + -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f, + 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f, + -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f, + 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f, + -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f, + -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f, + -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f, + 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f, + 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 
0.096625f, + 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f, + -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f, + -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f, + -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f, + -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f, + -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f, + 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f, + -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f, + -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f, + 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f, + 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f, + -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f, + 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f, + -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f, + -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f, + -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f, + 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f, + 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f, + 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f, + 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f, + 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f, + 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f, + 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f, + 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f, + 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f, + -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f, + -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f, + -0.212384f, -0.229157f, -0.283428f, -0.184891f, +}; 
+ +static const float weights_layer_3[] = { + -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f, + 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f, + -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f, + 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f, + -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f, + -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f, + -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f, + -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f, + -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f, + 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f, + -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f, + -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f, + 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f, + -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f, + 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f, + -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f, + 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f, + -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f, + -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f, + 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f, + 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f, + 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f, + 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f, + -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f, + -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f, + -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f, + -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f, + -0.008849f, -0.063527f, 
0.175963f, 0.185335f, 0.149151f, 0.199997f, + -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f, + 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f, + 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f, + 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f, + 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f, + -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f, + 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f, + -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f, + 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f, + 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f, + 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f, + -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f, + -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f, + 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f, + 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f, + -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f, + 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f, + -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f, + -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f, + 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f, + 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f, + 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f, + -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f, + -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f, + -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f, + -0.071383f, -0.075005f, +}; + +static const float weights_layer_4[] = { + -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f, + -0.518415f, 0.413484f, 
-0.264495f, 0.498083f, -0.450145f, -0.106419f, + 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f, + -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f, + -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f, + -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f, + -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f, + -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f, + -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f, + -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f, + 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f, + -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f, + -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f, + -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f, + -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f, + -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f, + -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f, + -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f, + 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f, + 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f, + -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f, + -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f, + -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f, + -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f, + -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f, + -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f, + 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f, + 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f, + 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f, + 0.207548f, 0.433813f, 
0.459889f, 0.431728f, 0.605050f, 0.485520f, + 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f, + -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f, + 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f, + 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f, + -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f, + -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f, + 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f, + 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f, + -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f, + 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f, + -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f, + 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f, + -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f, + 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f, + 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f, + 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f, + -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f, + -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f, + -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f, + 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f, + -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f, + -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f, + 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f, + -0.421885f, -0.293573f, +}; + +static const float weights_layer_5[] = { + 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f, + 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f, + 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f, + 0.054042f, -0.825482f, 0.418533f, 
0.305606f, 0.041549f, 0.432422f, + -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f, + 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f, + -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f, + 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f, + 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f, + -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f, + -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f, + 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f, + 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f, + -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f, + -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f, + 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f, + -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f, + -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f, + 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f, + 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f, + -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f, + -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f, + -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f, + 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f, + 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f, + -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f, + 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f, + -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f, + -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f, + -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f, + -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f, + -0.383920f, 0.292645f, 0.266341f, 
0.747473f, -0.645631f, -0.538896f, + -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f, + -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f, + -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f, + 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f, + -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f, + -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f, + -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f, + 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f, + -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f, + 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f, + -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f, + -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f, + 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f, + 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f, + 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f, + -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f, + -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f, + 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f, + 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f, + 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f, + 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f, + 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f, + 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f, + -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f, + 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f, + 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f, + -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f, + -0.879614f, 0.918957f, 0.473471f, 
-0.043781f, -0.688234f, -0.925875f, + -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f, + -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f, + 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f, + 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f, + 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f, + -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f, + -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f, + -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f, + 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f, + 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f, + -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f, + 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f, + 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f, + -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f, + -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f, + 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f, + 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f, + -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f, + -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f, + -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f, + -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f, + -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f, + -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f, + 0.565984f, 0.592690f, +}; + +/* Returns the lane-wise sum of the four vectors in a[], added pairwise as (a[0] + a[1]) + (a[2] + a[3]). */ static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) { + float32x4_t sum01 = vaddq_f32(a[0], a[1]); + float32x4_t sum23 = vaddq_f32(a[2], a[3]); + return vaddq_f32(sum01, sum23); +} + +/* 2x2-filter, stride-2 valid convolution for in_width >= 16 (see the asserts). Starting at output channel start_idx and continuing while ++start_idx < out_channels, it emits 8 outputs per inner step (16 input columns). The input is read only through input[0]; input channel k is taken to be the plane at offset k * in_height * in_width floats from it. Channels are consumed two at a time with 8 weights per pair. NOTE(review): this appears to require an even in_channels, contiguous channel planes and a width that is a multiple of 16 - confirm against the callers. */ static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + const float **input, 
int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width >= 16); + /* Offset in floats from one input channel plane to the next, as used by the loads below. */ const int in_size = in_height * in_width; + + do { + const float32x4_t bias_v = vdupq_n_f32(bias[0]); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + /* weights0 holds the four 2x2 taps applied to channel k, weights1 those applied to channel k + 1; the low halves multiply the row at in_ptr2, the high halves the row one in_stride below. */ const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + /* vld2q_f32 de-interleaves 8 consecutive floats: val[0] receives the even-indexed ones and val[1] the odd-indexed ones, i.e. the left and right filter column for 4 horizontally adjacent outputs. */ const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi_0 = + vld2q_f32(in_ptr2 + in_size + in_stride); + + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0); + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1); + + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0); + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1); + + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0); + sum0[2] = vmlaq_lane_f32(sum0[2], 
in1_lo_0.val[1], weights1_lo, 1); + + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0); + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1); + + /* Second group of 8 input columns, accumulated separately into sum1[]. */ const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8); + const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8); + const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8); + const float32x4x2_t in1_hi_1 = + vld2q_f32(in_ptr2 + in_size + in_stride + 8); + + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0); + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1); + + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0); + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1); + + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0); + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1); + + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0); + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum0)); + vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1)); + + out_ptr1 += 8; + in_ptr1 += 8 * skip_width; + w += 8 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +/* 2x2-filter, stride-2 valid convolution for in_width == 8 (see the asserts): one inner step covers the whole 8-float row and emits 4 outputs. Input channels are read two per iteration from input[0] at plane offsets of in_height * in_width floats, with 8 weights per channel pair. NOTE(review): appears to require an even in_channels and contiguous channel planes - confirm against the callers. */ static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 
2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width == 8); + const int in_size = in_height * in_width; + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + /* Only sum[0] is seeded with the bias so that it is added once when the four partial sums are combined. */ float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + /* in0_* come from channel k, in1_* from channel k + 1 (in_size floats further on); *_lo is the row at in_ptr2, *_hi the row one in_stride below. */ const float32x4x2_t in0_lo = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride); + + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0); + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1); + + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1); + + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0); + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1); + + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0); + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum)); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += 
skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +/* 5x5-filter, stride-4 valid convolution for a single input channel and in_width >= 16 (see the asserts). Each inner step reads 5 rows of 16 floats plus the float at column 16 of each row and emits 4 outputs; the 25 weights of an output channel are applied in row-major order (kernel row r, column c uses weights[5 * r + c]). Products are split across two accumulators, sum[0] (seeded with the bias) and sum[1], which are added before the store. */ static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 5 && filter_width == 5); + assert(skip_width == 4 && skip_height == 4); + assert(in_width >= 16); + assert(in_channels == 1); + /* in_channels is only referenced by the assert above; the cast silences unused-parameter warnings when asserts are compiled out. */ (void)in_channels; + + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *in_ptr0 = *input; + const float *weights_ptr0 = weights; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) }; + + /* Weights 0..23 are loaded as six vectors; weight 24 is read as a scalar at the end of kernel row 4. */ const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0); + const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4); + const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8); + const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12); + const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16); + const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20); + + const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3); + const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3); + const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7); + const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7); + const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11); + const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11); + const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15); + const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15); + const 
float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19); + const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19); + const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23); + const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23); + + /* vld4q_f32 de-interleaves 16 consecutive floats so that val[c] holds columns c, c + 4, c + 8 and c + 12: filter column c for 4 outputs that are 4 columns apart. */ const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride); + const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride); + const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride); + const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride); + const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride); + + /* Filter column 4: shift val[0] left by one lane and append the float at column 16, giving columns 4, 8, 12 and 16. NOTE(review): column 16 lies one float past the 16 loaded above - confirm the input rows are readable there at the last step. */ const float32x4_t in0_4 = vextq_f32( + in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1); + const float32x4_t in1_4 = vextq_f32( + in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1); + const float32x4_t in2_4 = vextq_f32( + in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1); + const float32x4_t in3_4 = vextq_f32( + in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1); + const float32x4_t in4_4 = vextq_f32( + in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1); + + // Kernel row 0. + sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0); + + // Kernel row 1. + sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1); + + // Kernel row 2. 
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0); + + // Kernel row 3. + sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1); + + // Kernel row 4. + sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1); + sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4); + + vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1])); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++output; + ++bias; + weights += 25; + } while (++start_idx < out_channels); +} + +// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). +// As per the current encoder, av1_cnn_convolve function gets called for +// block size equal to 64x64. av1_cnn_convolve() uses layer config values +// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few +// details related to each layer's config parameters. 
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht +// 0 64x64 16x16 5 5 4 4 +// 1 16x16 8x8 2 2 2 2 +// 2 8x8 4x4 2 2 2 2 +// 3 4x4 2x2 2 2 2 2 +// 4 2x2 1x1 2 2 2 2 +// Here, +// filter_wd = filter_width and filter_ht = filter_height, +// skip_wd = skip_width and skip_ht = skip_height. +/* Dispatch: 5x5 filters with skip 4 go to the 5x5 Neon kernel; 2x2 filters with skip 2 go to one of the 2x2 Neon kernels when in_width >= 16 or in_width == 8; every other case calls av1_cnn_convolve_no_maxpool_padding_valid_c(). The Neon kernels are handed the file-local weights_layer_N tables instead of weights taken from layer_config: weights_layer_5 for the 5x5 case, and for 2x2 weights_layer_1 unless output_num equals that of layer 2, 3 or 4 of av1_intra_mode_cnn_partition_cnn_config (weights_layer_2/3/4). NOTE(review): presumably these tables are a re-laid-out copy of that network's weights, which would make the Neon paths valid only for that config - confirm. */ void av1_cnn_convolve_no_maxpool_padding_valid_neon( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + assert(channel_step == 1); + assert(cstep == layer_config->in_channels * layer_config->out_channels); + + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights_layer_5); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + const float *weights = weights_layer_1; + if (layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) { + weights = weights_layer_2; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[3] + .output_num)) { + weights = weights_layer_3; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[4] + .output_num)) { + weights = weights_layer_4; + } + 
if (in_width >= 16) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else if (in_width == 8) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else { + /* Narrower 2x2 layers have no Neon kernel here; use the C implementation with the layer's own configuration. */ av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + } + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +}
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/encodetxb_neon.c similarity index 100% rename from av1/encoder/arm/neon/encodetxb_neon.c rename to av1/encoder/arm/encodetxb_neon.c
diff --git a/av1/encoder/arm/crc32/hash_arm_crc32.c b/av1/encoder/arm/hash_arm_crc32.c similarity index 98% rename from av1/encoder/arm/crc32/hash_arm_crc32.c rename to av1/encoder/arm/hash_arm_crc32.c index 91fc1e0..6417839 100644 --- a/av1/encoder/arm/crc32/hash_arm_crc32.c +++ b/av1/encoder/arm/hash_arm_crc32.c
@@ -19,6 +19,7 @@ #include <stdint.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #define CRC_LOOP(op, crc, type, buf, len) \ while ((len) >= sizeof(type)) { \
diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/highbd_fwd_txfm_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_fwd_txfm_neon.c rename to av1/encoder/arm/highbd_fwd_txfm_neon.c
diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c new file mode 100644 index 0000000..d067a76 --- /dev/null +++ b/av1/encoder/arm/highbd_pickrst_neon.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/encoder/arm/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+// Widens four unsigned 16-bit samples to 32 bits while shifting them left by
+// SGRPROJ_RST_BITS, and reinterprets the result as signed.
+static INLINE int32x4_t highbd_proj_upscale_u16x4(uint16x4_t v) {
+  return vreinterpretq_s32_u32(vshll_n_u16(v, SGRPROJ_RST_BITS));
+}
+
+// Adds the widened products a[i] * b[i] to acc: elements 0 and 2 go to lane 0
+// of acc, elements 1 and 3 to lane 1.
+static INLINE int64x2_t highbd_proj_mla_s32x4(int64x2_t acc, int32x4_t a,
+                                              int32x4_t b) {
+  acc = vmlal_s32(acc, vget_low_s32(a), vget_low_s32(b));
+  return vmlal_s32(acc, vget_high_s32(a), vget_high_s32(b));
+}
+
+// Computes all of H and C from both filtered planes. With
+//   u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
+//   f0 = flt0 - u, f1 = flt1 - u
+// the outputs are the sums over the block, each divided by width * height:
+//   H[0][0] = sum(f0 * f0), H[1][1] = sum(f1 * f1),
+//   H[0][1] = H[1][0] = sum(f0 * f1), C[0] = sum(f0 * s), C[1] = sum(f1 * s).
+// `width` must be a multiple of 8.
+static INLINE void highbd_calc_proj_params_r0_r1_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  // Entry [0] accumulates pixels 0..3 of every group of 8, entry [1] pixels
+  // 4..7.
+  int64x2_t h00[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t h11[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t h01[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t c0[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t c1[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src + x);
+      const uint16x8_t d = vld1q_u16(dat + x);
+      const uint16x4_t s_half[2] = { vget_low_u16(s), vget_high_u16(s) };
+      const uint16x4_t d_half[2] = { vget_low_u16(d), vget_high_u16(d) };
+
+      for (int k = 0; k < 2; ++k) {
+        const int32x4_t u = highbd_proj_upscale_u16x4(d_half[k]);
+        const int32x4_t e =
+            vsubq_s32(highbd_proj_upscale_u16x4(s_half[k]), u);
+        const int32x4_t f0 = vsubq_s32(vld1q_s32(flt0 + x + 4 * k), u);
+        const int32x4_t f1 = vsubq_s32(vld1q_s32(flt1 + x + 4 * k), u);
+
+        h00[k] = highbd_proj_mla_s32x4(h00[k], f0, f0);
+        h11[k] = highbd_proj_mla_s32x4(h11[k], f1, f1);
+        h01[k] = highbd_proj_mla_s32x4(h01[k], f0, f1);
+        c0[k] = highbd_proj_mla_s32x4(c0[k], f0, e);
+        c1[k] = highbd_proj_mla_s32x4(c1[k], f1, e);
+      }
+
+      x += 8;
+    } while (x != width);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt0 += flt0_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00[0], h00[1])) / size;
+  H[0][1] = horizontal_add_s64x2(vaddq_s64(h01[0], h01[1])) / size;
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11[0], h11[1])) / size;
+  H[1][0] = H[0][1];
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0[0], c0[1])) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1[0], c1[1])) / size;
+}
+
+// Variant that only uses flt0: writes H[0][0] and C[0] and leaves the other
+// entries of H and C untouched. `width` must be a multiple of 8.
+static INLINE void highbd_calc_proj_params_r0_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  int64x2_t h00[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t c0[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src + x);
+      const uint16x8_t d = vld1q_u16(dat + x);
+      const uint16x4_t s_half[2] = { vget_low_u16(s), vget_high_u16(s) };
+      const uint16x4_t d_half[2] = { vget_low_u16(d), vget_high_u16(d) };
+
+      for (int k = 0; k < 2; ++k) {
+        const int32x4_t u = highbd_proj_upscale_u16x4(d_half[k]);
+        const int32x4_t e =
+            vsubq_s32(highbd_proj_upscale_u16x4(s_half[k]), u);
+        const int32x4_t f0 = vsubq_s32(vld1q_s32(flt0 + x + 4 * k), u);
+
+        h00[k] = highbd_proj_mla_s32x4(h00[k], f0, f0);
+        c0[k] = highbd_proj_mla_s32x4(c0[k], f0, e);
+      }
+
+      x += 8;
+    } while (x != width);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt0 += flt0_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00[0], h00[1])) / size;
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0[0], c0[1])) / size;
+}
+
+// Variant that only uses flt1: writes H[1][1] and C[1] and leaves the other
+// entries of H and C untouched. `width` must be a multiple of 8.
+static INLINE void highbd_calc_proj_params_r1_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  int64x2_t h11[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t c1[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src + x);
+      const uint16x8_t d = vld1q_u16(dat + x);
+      const uint16x4_t s_half[2] = { vget_low_u16(s), vget_high_u16(s) };
+      const uint16x4_t d_half[2] = { vget_low_u16(d), vget_high_u16(d) };
+
+      for (int k = 0; k < 2; ++k) {
+        const int32x4_t u = highbd_proj_upscale_u16x4(d_half[k]);
+        const int32x4_t e =
+            vsubq_s32(highbd_proj_upscale_u16x4(s_half[k]), u);
+        const int32x4_t f1 = vsubq_s32(vld1q_s32(flt1 + x + 4 * k), u);
+
+        h11[k] = highbd_proj_mla_s32x4(h11[k], f1, f1);
+        c1[k] = highbd_proj_mla_s32x4(c1[k], f1, e);
+      }
+
+      x += 8;
+    } while (x != width);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11[0], h11[1])) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1[0], c1[1])) / size;
+}
+
+// The function calls 3 subfunctions for the following cases :
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+//    of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+//    non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+//    non-zero and need to be computed.
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  const int use_r0 = params->r[0] > 0;
+  const int use_r1 = params->r[1] > 0;
+
+  if (use_r0 && use_r1) {
+    highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, flt1,
+                                       flt1_stride, H, C);
+  } else if (use_r0) {
+    highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt0, flt0_stride, H, C);
+  } else if (use_r1) {
+    highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt1, flt1_stride, H, C);
+  }
+  // If neither radius is positive, H and C are not written.
+}
+
+// Byte-wise table lookup: returns the 16 bytes selected by `idx` from the 32
+// bytes of { a, b }, reinterpreted as eight 16-bit lanes.
+static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+  const uint8x16x2_t bytes = { { vreinterpretq_u8_s16(a),
+                                 vreinterpretq_u8_s16(b) } };
+  return vreinterpretq_s16_u8(vqtbl2q_u8(bytes, idx));
+#else
+  // Without the 128-bit lookup, look up the low and high halves of the result
+  // separately in the same table, expressed as four 64-bit entries.
+  const uint8x8x4_t bytes = { { vreinterpret_u8_s16(vget_low_s16(a)),
+                                vreinterpret_u8_s16(vget_high_s16(a)),
+                                vreinterpret_u8_s16(vget_low_s16(b)),
+                                vreinterpret_u8_s16(vget_high_s16(b)) } };
+  const uint8x8_t res_lo = vtbl4_u8(bytes, vget_low_u8(idx));
+  const uint8x8_t res_hi = vtbl4_u8(bytes, vget_high_u8(idx));
+  return vreinterpretq_s16_u8(vcombine_u8(res_lo, res_hi));
+#endif
+}
+
+// Byte-wise table lookup over { a, b, c }. The AArch64 path is a general
+// 48-byte lookup; the fallback path is not (see the comment in that branch).
+static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c,
+                              uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+  const uint8x16x3_t bytes = { { vreinterpretq_u8_s16(a),
+                                 vreinterpretq_u8_s16(b),
+                                 vreinterpretq_u8_s16(c) } };
+  return vreinterpretq_s16_u8(vqtbl3q_u8(bytes, idx));
+#else
+  // This is a specific implementation working only for compute stats with
+  // wiener_win == 5: the low half of the result is looked up in
+  // { a, low(b) } and the high half in { b, low(c) }, with the high indices
+  // rebased by 16 bytes to match.
+  const uint8x8x3_t bytes_lo = { { vreinterpret_u8_s16(vget_low_s16(a)),
+                                   vreinterpret_u8_s16(vget_high_s16(a)),
+                                   vreinterpret_u8_s16(vget_low_s16(b)) } };
+  const uint8x8x3_t bytes_hi = { { vreinterpret_u8_s16(vget_low_s16(b)),
+                                   vreinterpret_u8_s16(vget_high_s16(b)),
+                                   vreinterpret_u8_s16(vget_low_s16(c)) } };
+  const uint8x8_t idx_hi = vsub_u8(vget_high_u8(idx), vdup_n_u8(16));
+  const uint8x8_t res_lo = vtbl3_u8(bytes_lo, vget_low_u8(idx));
+  const uint8x8_t res_hi = vtbl3_u8(bytes_hi, idx_hi);
+  return vreinterpretq_s16_u8(vcombine_u8(res_lo, res_hi));
+#endif
+}
+
+// Shifts x right by `power` bits; negative values are biased by
+// (1 << power) - 1 first so that the result rounds towards zero.
+static INLINE int64_t div_shift_s64(int64_t x, int power) {
+  if (x < 0) {
+    x += (1ll << power) - 1;
+  }
+  return x >> power;
+}
+
+// The M matrix is accumulated in a bitdepth-dependent number of steps to
+// speed up the computation. This function computes the final M from the
+// accumulated (src_s64) and the residual parts (src_s32). It also transposes
+// the result as the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+                                   const int32_t *src_s32, const int wiener_win,
+                                   int shift) {
+  for (int row = 0; row < wiener_win; ++row) {
+    for (int col = 0; col < wiener_win; ++col) {
+      // Element (row, col) of the output is element (col, row) of the input.
+      const int src_idx = col * wiener_win + row;
+      dst[row * wiener_win + col] =
+          div_shift_s64(src_s64[src_idx] + src_s32[src_idx], shift);
+    }
+  }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in a bitdepth-dependent number of
+// steps to speed up the computation. This function computes the final H from
+// the accumulated (src_s64) and the residual parts (src_s32). The computed H is
+// only an upper triangle matrix, this function also fills the lower triangle of
+// the resulting matrix.
+static INLINE void update_H(int64_t *dst, const int64_t *src_s64,
+                            const int32_t *src_s32, const int wiener_win,
+                            int stride, int shift) {
+  // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+  // `wiener_win2` is 9, the M matrix is 3x3:
+  // 0, 3, 6
+  // 1, 4, 7
+  // 2, 5, 8
+  //
+  // This is viewed as a vector to compute H (9x9) by vector outer product:
+  // 0, 3, 6, 1, 4, 7, 2, 5, 8
+  //
+  // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+  // 0,    3,    6,    1,    4,    7,    2,    5,    8,
+  // 3,   30,   33,   12,   31,   34,   21,   32,   35,
+  // 6,   33,   60,   15,   42,   61,   24,   51,   62,
+  // 1,   12,   15,   10,   13,   16,   11,   14,   17,
+  // 4,   31,   42,   13,   40,   43,   22,   41,   44,
+  // 7,   34,   61,   16,   43,   70,   25,   52,   71,
+  // 2,   21,   24,   11,   22,   25,   20,   23,   26,
+  // 5,   32,   51,   14,   41,   52,   23,   50,   53,
+  // 8,   35,   62,   17,   44,   71,   26,   53,   80,
+  const int wiener_win2 = wiener_win * wiener_win;
+  int64_t *out = dst;
+
+  // `j` walks the remapped order along the columns:
+  //   0, wiener_win, 2 * wiener_win, ..., 1, 1 + wiener_win, ...
+  // (0, 3, 6, 1, 4, 7, 2, 5, 8 in the 3x3 example). `l` walks the same
+  // sequence along the rows.
+  for (int col_start = 0; col_start < wiener_win; ++col_start) {
+    for (int j = col_start; j < wiener_win2; j += wiener_win) {
+      for (int row_start = 0; row_start < wiener_win; ++row_start) {
+        for (int l = row_start; l < wiener_win2; l += wiener_win) {
+          // The plain double-transpose index would be `stride * j + l`.
+          // Only the upper-right triangle of the source holds data, so read
+          // from the mirrored position whenever j > l.
+          const int lo = AOMMIN(j, l);
+          const int hi = AOMMAX(j, l);
+          const int tr_idx = stride * lo + hi;
+
+          // Combine the 64-bit total with the 32-bit residual and scale.
+          *out++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+        }
+      }
+    }
+  }
+}
+
+// Load 7x7 matrix into 7 128-bit vectors from consecutive rows, the last load
+// address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src,
+                                         ptrdiff_t stride) {
+  for (int r = 0; r < 6; ++r) {
+    dst[r] = vld1q_s16(src);
+    src += stride;
+  }
+  // Last row starts one element earlier, so its lanes are shifted by one.
+  dst[6] = vld1q_s16(src - 1);
+}
+
+// Accumulates the 7x7 Wiener statistics M (cross-correlation, 49 entries) and
+// H (auto-covariance, 49x49 entries) over a width x height region, processing
+// two horizontally adjacent pixels per iteration plus one trailing pixel for
+// odd widths. Sums are kept in 32 bits and periodically flushed to 64 bits.
+static INLINE void highbd_compute_stats_win7_neon(
+    const uint16_t *dgd, const uint16_t *src, int avg, int width, int height,
+    int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    aom_bit_depth_t bit_depth) {
+  // Matrix names are capitalized to help readability.
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+  DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+  DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+  memset(M_s32, 0, sizeof(M_s32));
+  memset(M_s64, 0, sizeof(M_s64));
+  memset(H_s32, 0, sizeof(H_s32));
+  memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
+  // matrices. Rows 0..5 pack the window of the first pixel, rows 6..11 the
+  // window of the second pixel.
+  // clang-format off
+  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = {
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17,
+     2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17, 18, 19,
+     4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21,
+     6,  7,  8,  9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23,
+     8,  9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+    10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+     2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 18, 19,
+     4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21,
+     6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+     8,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25,
+    10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+    12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  };
+  // clang-format on
+
+  uint8x16_t lut[12];
+  for (int i = 0; i < 12; ++i) {
+    lut[i] = vld1q_u8(shuffle_stats7_highbd + 16 * i);
+  }
+
+  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+  // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+  // be as high as 32768/2048/128 for the compute stats.
+  const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+  int acc_cnt = acc_cnt_max;
+  const int src_next = src_stride - width;
+  const int dgd_next = dgd_stride - width;
+  const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+  do {
+    int j = width;
+    while (j >= 2) {
+      // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the
+      // middle 6x7 elements being shared.
+      int16x8_t dgd_rows[7];
+      load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      // Start of the last window row, used for the scalar 49th elements.
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+      dgd += 2;
+
+      for (int r = 0; r < 7; ++r) {
+        dgd_rows[r] = vsubq_s16(dgd_rows[r], avg_s16);
+      }
+
+      // Split the combined 8x7 matrix into the two 7x7 windows (one per
+      // pixel), each packed into an int16x8_t[6] holding 48 of the 49
+      // `dgd - avg` values, and mirror them into the DGD_AVG buffers.
+      int16x8_t dgd_avg0[6];
+      int16x8_t dgd_avg1[6];
+      for (int r = 0; r < 6; ++r) {
+        dgd_avg0[r] = tbl2q(dgd_rows[r], dgd_rows[r + 1], lut[r]);
+        dgd_avg1[r] = tbl2q(dgd_rows[r], dgd_rows[r + 1], lut[r + 6]);
+        vst1q_s16(DGD_AVG0 + 8 * r, dgd_avg0[r]);
+        vst1q_s16(DGD_AVG1 + 8 * r, dgd_avg1[r]);
+      }
+
+      // The remaining last (49th) elements of `dgd - avg`.
+      DGD_AVG0[48] = dgd_ptr[6] - avg;
+      DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+      // Accumulate into a row-major variant of M (cross-correlation) for 2
+      // output pixels at a time. M itself must be filled column by column,
+      // which would need strided stores, so M_s32 is filled row by row
+      // instead and transposed at the very end.
+      const int src_avg0 = src[0] - avg;
+      const int src_avg1 = src[1] - avg;
+      src += 2;
+      const int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      const int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+      for (int r = 0; r < 6; ++r) {
+        update_M_2pixels(M_s32 + 8 * r, src_avg0_s16, src_avg1_s16,
+                         dgd_avg0[r], dgd_avg1[r]);
+      }
+
+      // Last (49th) element of M_s32 can be computed as scalar more efficiently
+      // for 2 output pixels.
+      M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
+
+      // Accumulate into a row-major version of H (auto-covariance, 49 * 49):
+      // the outer product of the window with itself. H is symmetric, so only
+      // the upper-right triangle is computed (covered by 4x4 tiles) and the
+      // rest is filled in later. The expected output is column-major; the
+      // double transpose at the end of the function converts H_s32 to that
+      // layout.
+      update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+      // The last element of the triangle of H_s32 matrix can be computed as a
+      // scalar more efficiently.
+      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+          DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+      // Accumulate into 64-bit after a bit depth dependent number of iterations
+      // to prevent overflow.
+      if (--acc_cnt == 0) {
+        acc_cnt = acc_cnt_max;
+
+        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+        // The widening accumulation is only needed for the upper triangle part
+        // of the matrix.
+        for (int k = 0; k < WIENER_WIN2; ++k) {
+          int64_t *row_s64 = H_s64 + k * WIENER_WIN2_ALIGN2;
+          int32_t *row_s32 = H_s32 + k * WIENER_WIN2_ALIGN2;
+
+          // Start at the 4-element aligned column at or before the diagonal.
+          const int k4 = k / 4 * 4;
+          accumulate_and_clear(row_s64 + k4, row_s32 + k4, 48 - k4);
+
+          // Last element of the row is computed separately.
+          row_s64[48] += row_s32[48];
+          row_s32[48] = 0;
+        }
+      }
+
+      j -= 2;
+    }
+
+    // Computations for odd pixel in the row.
+    if (width & 1) {
+      // Same 8x7 load as above; only the first of the two overlapping 7x7
+      // windows is used here.
+      int16x8_t dgd_rows[7];
+      load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+      ++dgd;
+
+      // Pack the 7x7 window tightly into an int16x8_t[6] (48 of the 49
+      // elements), subtract avg and mirror the result into DGD_AVG0.
+      int16x8_t dgd_avg0[6];
+      for (int r = 0; r < 6; ++r) {
+        dgd_avg0[r] =
+            vsubq_s16(tbl2q(dgd_rows[r], dgd_rows[r + 1], lut[r]), avg_s16);
+        vst1q_s16(DGD_AVG0 + 8 * r, dgd_avg0[r]);
+      }
+
+      // The remaining last (49th) element of `dgd - avg`.
+      DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+      // Row-major M accumulation for 1 output pixel; see the 2-pixel path for
+      // the layout rationale.
+      const int src_avg0 = *src++ - avg;
+      const int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      for (int r = 0; r < 6; ++r) {
+        update_M_1pixel(M_s32 + 8 * r, src_avg0_s16, dgd_avg0[r]);
+      }
+
+      // Last (49th) element of M_s32 can be computed as scalar more efficiently
+      // for 1 output pixel.
+      M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+      // Row-major upper-triangle H accumulation for 1 output pixel; see the
+      // 2-pixel path for the layout rationale.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // scalar more efficiently.
+      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  const int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift);
+}
+
+// Load 5x5 matrix into 5 128-bit vectors from consecutive rows, the last load
+// address is offset to prevent out-of-bounds access.
+// Loads five rows of eight int16 samples from `src` (row pitch `stride`) into
+// dst[0..4]. Rows 0-3 start at `src`. Row 4 is loaded starting at `src - 3`,
+// so consumers must index it with a +3 element offset; the shuffle tables in
+// highbd_compute_stats_win5_neon() do exactly that. The backwards shift means
+// the vector load of the last row ends at column 4 rather than column 7
+// (presumably to avoid reading past the end of the buffer - TODO confirm).
+static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src,
+                                         ptrdiff_t stride) {
+  dst[0] = vld1q_s16(src);
+  src += stride;
+  dst[1] = vld1q_s16(src);
+  src += stride;
+  dst[2] = vld1q_s16(src);
+  src += stride;
+  dst[3] = vld1q_s16(src);
+  src += stride;
+  dst[4] = vld1q_s16(src - 3);
+}
+
+// Computes the Wiener statistics for the 5x5 window over a width x height
+// block: the cross-correlation M (5 * 5) between `src - avg` and `dgd - avg`
+// and the auto-covariance H (25 * 25) of `dgd - avg`.
+//
+// dgd points at the top-left sample of the first window, src at the first
+// source pixel. Pixels are processed two at a time, with a single-pixel path
+// for an odd `width`. Products are accumulated in 32-bit and widened to 64-bit
+// every `acc_cnt_max` two-pixel iterations. The results are handed to
+// acc_transpose_M() and update_H() together with `bit_depth - AOM_BITS_8`.
+static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
+                                           const uint16_t *src, int avg,
+                                           int width, int height,
+                                           int dgd_stride, int src_stride,
+                                           int64_t *M, int64_t *H,
+                                           aom_bit_depth_t bit_depth) {
+  // Matrix names are capitalized to help readability.
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t,
+                  H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+  DECLARE_ALIGNED(64, int64_t,
+                  H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+  memset(M_s32, 0, sizeof(M_s32));
+  memset(M_s64, 0, sizeof(M_s64));
+  memset(H_s32, 0, sizeof(H_s32));
+  memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create 8x3 matrix with consecutive elements from 5x5
+  // matrix. Entries are byte indices into the concatenated source rows. Rows
+  // 0-2 of the table select the window of the first pixel, rows 3-5 the window
+  // of the second pixel (shifted by one 16-bit element).
+  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = {
+    0, 1, 2,  3,  4,  5,  6,  7,  8,  9,  16, 17, 18, 19, 20, 21,
+    6, 7, 8,  9,  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33,
+    2, 3, 4,  5,  6,  7,  8,  9,  22, 23, 24, 25, 26, 27, 28, 29,
+    2, 3, 4,  5,  6,  7,  8,  9,  10, 11, 18, 19, 20, 21, 22, 23,
+    8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35,
+    4, 5, 6,  7,  8,  9,  10, 11, 24, 25, 26, 27, 28, 29, 30, 31,
+  };
+
+  const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0);
+  const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16);
+  const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32);
+  const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48);
+  const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
+  const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
+
+  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+  // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+  // be as high as 32768/2048/128 for the compute stats.
+  const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+  int acc_cnt = acc_cnt_max;
+  const int src_next = src_stride - width;
+  const int dgd_next = dgd_stride - width;
+  const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+  do {
+    int j = width;
+    while (j >= 2) {
+      // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+      // middle 4x5 elements being shared.
+      int16x8_t dgd_rows[5];
+      load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+      dgd += 2;
+
+      dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+      dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+      dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+      dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+      dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+
+      // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1
+      // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays.
+      // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg`
+      // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements.
+      int16x8_t dgd_avg0[3];
+      int16x8_t dgd_avg1[3];
+
+      dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+      dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+      dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1);
+      dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4);
+      dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2);
+      dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5);
+
+      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+      vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+      vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+      vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+      // The remaining last (25th) elements of `dgd - avg`.
+      DGD_AVG0[24] = dgd_ptr[4] - avg;
+      DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+      // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+      // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+      // that multiplying one element from src with each element of a row of the
+      // wiener window will fill one column of M. However this is not very
+      // convenient in terms of memory access, as it means we do contiguous
+      // loads of dgd but strided stores to M. As a result, we use an
+      // intermediate matrix M_s32 which is instead filled such that one row of
+      // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+      // then transposed to return M.
+      int src_avg0 = *src++ - avg;
+      int src_avg1 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+      update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+                       dgd_avg1[0]);
+      update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+                       dgd_avg1[1]);
+      update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+                       dgd_avg1[2]);
+
+      // Last (25th) element of M_s32 can be computed as scalar more efficiently
+      // for 2 output pixels.
+      M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+      // Start accumulating into row-major version of matrix H
+      // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+      // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+      // of elements of the wiener window together (vector outer product). Since
+      // it is a symmetric matrix, we only compute the upper-right triangle, and
+      // then copy it down to the lower-left later. The upper triangle is
+      // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+      // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back to
+      // the expected output layout.
+      update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+      // The last element of the triangle of H_s32 matrix can be computed as a
+      // scalar more efficiently.
+      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+          DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+      // Accumulate into 64-bit after a bit depth dependent number of iterations
+      // to prevent overflow.
+      if (--acc_cnt == 0) {
+        acc_cnt = acc_cnt_max;
+
+        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+        // The widening accumulation is only needed for the upper triangle part
+        // of the matrix.
+        int64_t *lh = H_s64;
+        int32_t *lh32 = H_s32;
+        for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+          // The widening accumulation is only run for the relevant parts
+          // (upper-right triangle) in a row 4-element aligned.
+          int k4 = k / 4 * 4;
+          accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+          // Last element of the row is computed separately.
+          lh[24] += lh32[24];
+          lh32[24] = 0;
+
+          lh += WIENER_WIN2_REDUCED_ALIGN2;
+          lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+        }
+      }
+
+      j -= 2;
+    }
+
+    // Computations for odd pixel in the row.
+    if (width & 1) {
+      // Load the 6x5 matrix as in the two-pixel path; only the 5x5 window of
+      // the first pixel is used here.
+      int16x8_t dgd_rows[5];
+      load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+      ++dgd;
+
+      // Re-arrange the combined 6x5 matrix to have a whole 5x5 matrix tightly
+      // packed into a int16x8_t[3] array. This array contains 24 elements of
+      // the 25 (5x5). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+      // buffer contains 25 consecutive elements.
+      int16x8_t dgd_avg0[3];
+
+      dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+      dgd_avg0[1] = vsubq_s16(
+          tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16);
+      dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16);
+
+      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+      // The remaining last (25th) element of `dgd - avg`. Only the first
+      // window is needed here, so dgd_ptr[5] (one element past this window on
+      // its last row) is deliberately not read and DGD_AVG1 is left untouched.
+      DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+      // Accumulate into row-major order variant of matrix M (cross-correlation)
+      // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+      // such that multiplying one element from src with each element of a row
+      // of the wiener window will fill one column of M. However this is not
+      // very convenient in terms of memory access, as it means we do
+      // contiguous loads of dgd but strided stores to M. As a result, we use an
+      // intermediate matrix M_s32 which is instead filled such that one row of
+      // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+      // then transposed to return M.
+      int src_avg0 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+      // Last (25th) element of M_s32 can be computed as scalar more efficiently
+      // for 1 output pixel.
+      M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+      // Start accumulating into row-major order version of matrix H
+      // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+      // H is of size 25 * 25. It is filled by multiplying every pair of
+      // elements of the wiener window together (vector outer product). Since it
+      // is a symmetric matrix, we only compute the upper-right triangle, and
+      // then copy it down to the lower-left later. The upper triangle is
+      // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+      // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back to
+      // the expected output layout.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+      // The last element of the triangle of H_s32 matrix can be computed as a
+      // scalar more efficiently.
+      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+          DGD_AVG0[24] * DGD_AVG0[24];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+           bit_depth_shift);
+}
+
+// Returns the truncated integer mean of a width x height block of 16-bit
+// samples. Each row is summed in chunks of 32/16/8/4 samples with NEON and any
+// remaining 1-3 samples in scalar code. Per-row 32-bit partial sums are
+// widened into a 64-bit total. Both dimensions must be positive.
+static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
+                                         int width, int height) {
+  assert(width > 0);
+  assert(height > 0);
+
+  uint64x2_t sum_u64 = vdupq_n_u64(0);
+  uint64_t sum = 0;
+
+  int h = height;
+  do {
+    uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+    int w = width;
+    const uint16_t *row = src;
+    while (w >= 32) {
+      uint16x8_t s0 = vld1q_u16(row + 0);
+      uint16x8_t s1 = vld1q_u16(row + 8);
+      uint16x8_t s2 = vld1q_u16(row + 16);
+      uint16x8_t s3 = vld1q_u16(row + 24);
+
+      // Pairs are added in 16-bit before widening. NOTE(review): this relies
+      // on samples being at most 15 bits wide - confirm against callers.
+      s0 = vaddq_u16(s0, s1);
+      s2 = vaddq_u16(s2, s3);
+      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+      sum_u32[1] = vpadalq_u16(sum_u32[1], s2);
+
+      row += 32;
+      w -= 32;
+    }
+
+    if (w >= 16) {
+      uint16x8_t s0 = vld1q_u16(row + 0);
+      uint16x8_t s1 = vld1q_u16(row + 8);
+
+      s0 = vaddq_u16(s0, s1);
+      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+      row += 16;
+      w -= 16;
+    }
+
+    if (w >= 8) {
+      uint16x8_t s0 = vld1q_u16(row);
+      sum_u32[1] = vpadalq_u16(sum_u32[1], s0);
+
+      row += 8;
+      w -= 8;
+    }
+
+    if (w >= 4) {
+      uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0));
+      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+      row += 4;
+      w -= 4;
+    }
+
+    while (w-- > 0) {
+      sum += *row++;
+    }
+
+    sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1]));
+
+    src += src_stride;
+  } while (--h != 0);
+
+  return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width));
+}
+
+// High bit-depth Wiener statistics entry point (NEON).
+//
+// Computes M and H for the region [h_start, h_end) x [v_start, v_end) of the
+// degraded frame dgd8 against the source frame src8 (both are converted with
+// CONVERT_TO_SHORTPTR). wiener_win must be WIENER_WIN or WIENER_WIN_REDUCED.
+// The dgd_avg and src_avg scratch buffers are not used by this implementation.
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+                                   const uint8_t *src8, int16_t *dgd_avg,
+                                   int16_t *src_avg, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth) {
+  (void)dgd_avg;
+  (void)src_avg;
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+  const int wiener_halfwin = wiener_win >> 1;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const int height = v_end - v_start;
+  const int width = h_end - h_start;
+
+  const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+  const uint16_t *src_start = src + h_start + v_start * src_stride;
+
+  // The wiener window will slide along the dgd frame, centered on each pixel.
+  // For the top left pixel and all the pixels on the side of the frame this
+  // means half of the window will be outside of the frame. As such the actual
+  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+
+  uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height);
+
+  if (wiener_win == WIENER_WIN) {
+    highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height,
+                                   dgd_stride, src_stride, M, H, bit_depth);
+  } else {
+    highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height,
+                                   dgd_stride, src_stride, M, H, bit_depth);
+  }
+}
+
+// Returns the sum of squared errors between the source block `src8` and the
+// self-guided projection of the degraded block `dat8`.
+//
+// Three cases are handled, selected by params->r[]:
+//  - both radii > 0: both filter outputs flt0/flt1 are weighted by xq[0]/xq[1];
+//  - exactly one radius > 0: only the matching filter output and weight are
+//    used;
+//  - neither radius > 0: plain SSE between dat and src.
+// Each row is processed eight pixels at a time with NEON; the remaining
+// `width % 8` pixels (or the whole row when width < 8) are handled in scalar
+// code. `height` must be at least 1.
+int64_t av1_highbd_pixel_proj_error_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  int64_t sse = 0;
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    int32x2_t xq_v = vld1_s32(xq);
+    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      // Pre-tested loop: rows narrower than 8 pixels must skip the vector
+      // path entirely, otherwise pixels outside the block would be read and
+      // accumulated.
+      while (j <= width - 8) {
+        const uint16x8_t d = vld1q_u16(&dat[j]);
+        const uint16x8_t s = vld1q_u16(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+        int32x4_t d_s32_lo = vreinterpretq_s32_u32(
+            vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+        int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
+            vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+
+        int32x4_t v0 = vsubq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+            d_s32_lo);
+        int32x4_t v1 = vsubq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+            d_s32_hi);
+
+        v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0);
+        v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0);
+        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+                                vreinterpretq_s16_u16(vsubq_u16(d, s)));
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      }
+
+      for (int k = j; k < width; ++k) {
+        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]);
+        v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4);
+        int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += ((int64_t)e * e);
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    } while (--height != 0);
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    int32x4_t xq_v = vdupq_n_s32(xq_active);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+      // Pre-tested loop, see the two-filter case above.
+      while (j <= width - 8) {
+        const uint16x8_t d0 = vld1q_u16(&dat[j]);
+        const uint16x8_t s0 = vld1q_u16(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);
+
+        uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
+        int32x4_t sub0 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
+        int32x4_t sub1 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
+
+        int32x4_t v0 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
+            xq_v);
+        int32x4_t v1 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
+            xq_v);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+                                vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      }
+
+      for (int k = j; k < width; ++k) {
+        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        // Index the filter output with k, the pixel being processed; j only
+        // marks where the vector loop stopped.
+        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
+        const int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += ((int64_t)e * e);
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      flt += flt_stride;
+      src += src_stride;
+    } while (--height != 0);
+  } else {
+    do {
+      int j = 0;
+
+      // Pre-tested loop, see the two-filter case above.
+      while (j <= width - 8) {
+        const uint16x8_t d = vld1q_u16(&dat[j]);
+        const uint16x8_t s = vld1q_u16(&src[j]);
+
+        uint16x8_t diff = vabdq_u16(d, s);
+        uint16x4_t diff_lo = vget_low_u16(diff);
+        uint16x4_t diff_hi = vget_high_u16(diff);
+
+        uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
+        uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
+
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
+
+        j += 8;
+      }
+
+      for (int k = j; k < width; ++k) {
+        int32_t e = dat[k] - src[k];
+        sse += e * e;
+      }
+
+      dat += dat_stride;
+      src += src_stride;
+    } while (--height != 0);
+  }
+
+  sse += horizontal_add_s64x2(sse_s64);
+  return sse;
+}
diff --git a/av1/encoder/arm/highbd_pickrst_sve.c b/av1/encoder/arm/highbd_pickrst_sve.c new file mode 100644 index 0000000..4f804c9 --- /dev/null +++ b/av1/encoder/arm/highbd_pickrst_sve.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/arm/pickrst_sve.h"
+
+// Returns the truncated integer mean of a width x height block of 16-bit
+// samples. Full groups of eight columns are loaded with NEON; the final 1 to 8
+// columns of each row are loaded under an SVE predicate so that lanes beyond
+// `width` do not contribute to the sum.
+static INLINE uint16_t find_average_sve(const uint16_t *src, int src_stride,
+                                        int width, int height) {
+  uint64x2_t avg_u64 = vdupq_n_u64(0);
+  // Multiplying by a vector of ones turns the dot product into a plain sum.
+  uint16x8_t ones = vdupq_n_u16(1);
+
+  // Use a predicate to compute the last columns.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  int h = height;
+  do {
+    int j = width;
+    const uint16_t *src_ptr = src;
+    // `j > 8` (not `>=`) always leaves 1 to 8 columns for the predicated tail.
+    while (j > 8) {
+      uint16x8_t s = vld1q_u16(src_ptr);
+      avg_u64 = aom_udotq_u16(avg_u64, s, ones);
+
+      j -= 8;
+      src_ptr += 8;
+    }
+    uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr));
+    avg_u64 = aom_udotq_u16(avg_u64, s_end, ones);
+
+    src += src_stride;
+  } while (--h != 0);
+  return (uint16_t)(vaddvq_u64(avg_u64) / (width * height));
+}
+
+// Writes `buf - avg` for a width x height block into buf_avg as int16.
+//
+// The last 1 to 8 columns of each row are loaded under a predicate, but a full
+// 8-lane vector is stored: lanes past `width` are written as zero (both the
+// sample and the average are zero in inactive lanes). buf_avg must therefore
+// have room for the row width rounded up to a multiple of 8.
+static INLINE void compute_sub_avg(const uint16_t *buf, int buf_stride,
+                                   int16_t avg, int16_t *buf_avg,
+                                   int buf_avg_stride, int width, int height) {
+  uint16x8_t avg_u16 = vdupq_n_u16(avg);
+
+  // Use a predicate to compute the last columns.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  // Average replicated only in the active lanes; inactive lanes are zero.
+  uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg));
+
+  do {
+    int j = width;
+    const uint16_t *buf_ptr = buf;
+    int16_t *buf_avg_ptr = buf_avg;
+    while (j > 8) {
+      uint16x8_t d = vld1q_u16(buf_ptr);
+      vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16)));
+
+      j -= 8;
+      buf_ptr += 8;
+      buf_avg_ptr += 8;
+    }
+    uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr));
+    vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end)));
+
+    buf += buf_stride;
+    buf_avg += buf_avg_stride;
+  } while (--height > 0);
+}
+
+// Mirrors the upper triangle of the wiener_win2 x wiener_win2 matrix H_tmp
+// into its lower triangle (H_tmp is modified in place), then adds every
+// element of H_tmp, divided by `divider`, to H. The mirroring works on 2x2
+// tiles, two rows at a time.
+static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
+                                       const int wiener_win2,
+                                       const int divider) {
+  for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
+    // Transpose the first 2x2 square. It needs a special case as the element
+    // of the bottom left is on the diagonal.
+    int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
+    int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
+
+    int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
+
+    vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
+    vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
+
+    // Transpose and store all the remaining 2x2 squares of the line.
+    for (int j = i + 3; j < wiener_win2; j = j + 2) {
+      row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
+      row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
+
+      int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
+      int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
+
+      vst1q_s64(H_tmp + (j + 0) * wiener_win2 + i, tr_row0);
+      vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
+    }
+  }
+  // Scale by the bit-depth divider and accumulate into the caller's matrix.
+  for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
+    H[i] += H_tmp[i] / divider;
+  }
+}
+
+// Transpose the matrix that has just been computed and accumulate it in M.
+// Each element of M_trn is divided by `divider` before being added.
+static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
+                                   const int wiener_win, const int divider) {
+  for (int i = 0; i < wiener_win; ++i) {
+    for (int j = 0; j < wiener_win; ++j) {
+      int tr_idx = j * wiener_win + i;
+      *M++ += (int64_t)(M_trn[tr_idx] / divider);
+    }
+  }
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+//
+// dgd_avg and src_avg hold the average-subtracted samples (see
+// av1_compute_stats_highbd_sve()). The accumulated M and H are divided by
+// bit_depth_divider when they are added to the output matrices.
+static INLINE void highbd_compute_stats_win7_sve(
+    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
+    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
+  const int wiener_win = 7;
+  const int wiener_win2 = wiener_win * wiener_win;
+
+  // Use a predicate to compute the last columns of the block for H.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  // Use intermediate matrices for H and M to perform the computation, they
+  // will be accumulated into the original H and M at the end.
+  int64_t M_trn[49];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int64_t H_tmp[49 * 49];
+  memset(H_tmp, 0, sizeof(H_tmp));
+
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int j = 0;
+      // No tail handling here: full vectors are read past `width`. The caller
+      // states that the buffers were zeroed beforehand and pads the strides to
+      // a multiple of 8.
+      while (j < width) {
+        int16x8_t dgd[7];
+        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
+        int16x8_t s = vld1q_s16(src_avg + j);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win7(s, dgd, M_trn, row);
+
+        j += 8;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j < width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[7];
+        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[7];
+          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a predicate to discard excess elements.
+    for (int col0 = 0; col0 < wiener_win; col0++) {
+      // Load first column.
+      int16x8_t dgd0[7];
+      dgd0[0] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+      dgd0[1] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+      dgd0[2] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+      dgd0[3] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+      dgd0[4] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+      dgd0[5] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
+      dgd0[6] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
+
+      // Perform computation of the first column with itself (28 elements).
+      // For the first column this will fill the upper triangle of the 7x7
+      // matrix at the top left of the H matrix. For the next columns this
+      // will fill the upper triangle of the other 7x7 matrices around H's
+      // diagonal.
+      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+      // All computation next to the matrix diagonal has already been done.
+      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+        // Load second column. This load is not predicated: the excess lanes
+        // are multiplied by the zeroed lanes of the first column.
+        int16x8_t dgd1[7];
+        load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                     &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+        // Compute all elements from the combination of both columns (49
+        // elements).
+        compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+      }
+    }
+    dgd_avg += dgd_avg_stride;
+    src_avg += src_avg_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  acc_transpose_M(M, M_trn, 7, bit_depth_divider);
+
+  // Copy upper triangle of H in the lower one.
+  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+//
+// dgd_avg and src_avg hold the average-subtracted samples (see
+// av1_compute_stats_highbd_sve()). The accumulated M and H are divided by
+// bit_depth_divider when they are added to the output matrices.
+static INLINE void highbd_compute_stats_win5_sve(
+    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
+    int width, int height, int64_t *M, int64_t *H, int bit_depth_divider) {
+  const int wiener_win = 5;
+  const int wiener_win2 = wiener_win * wiener_win;
+
+  // Use a predicate to compute the last columns of the block for H.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8);
+
+  // Use intermediate matrices for H and M to perform the computation, they
+  // will be accumulated into the original H and M at the end.
+  int64_t M_trn[25];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int64_t H_tmp[25 * 25];
+  memset(H_tmp, 0, sizeof(H_tmp));
+
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int j = 0;
+      // No tail handling here: full vectors are read past `width`. The caller
+      // states that the buffers were zeroed beforehand and pads the strides to
+      // a multiple of 8.
+      while (j < width) {
+        int16x8_t dgd[5];
+        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+                     &dgd[2], &dgd[3], &dgd[4]);
+        int16x8_t s = vld1q_s16(src_avg + j);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win5(s, dgd, M_trn, row);
+
+        j += 8;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j < width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+                     &dgd0[2], &dgd0[3], &dgd0[4]);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[5];
+          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                       &dgd1[2], &dgd1[3], &dgd1[4]);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a predicate to discard excess elements.
+    for (int col0 = 0; col0 < wiener_win; col0++) {
+      // Load first column.
+      int16x8_t dgd0[5];
+      dgd0[0] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+      dgd0[1] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+      dgd0[2] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+      dgd0[3] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+      dgd0[4] = svget_neonq_s16(
+          svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+
+      // Perform computation of the first column with itself (15 elements).
+      // For the first column this will fill the upper triangle of the 5x5
+      // matrix at the top left of the H matrix. For the next columns this
+      // will fill the upper triangle of the other 5x5 matrices around H's
+      // diagonal.
+      compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+      // All computation next to the matrix diagonal has already been done.
+      for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+        // Load second column. This load is not predicated: the excess lanes
+        // are multiplied by the zeroed lanes of the first column.
+        int16x8_t dgd1[5];
+        load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+                     &dgd1[2], &dgd1[3], &dgd1[4]);
+
+        // Compute all elements from the combination of both columns (25
+        // elements).
+        compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+      }
+    }
+    dgd_avg += dgd_avg_stride;
+    src_avg += src_avg_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  acc_transpose_M(M, M_trn, 5, bit_depth_divider);
+
+  // Copy upper triangle of H in the lower one.
+  copy_upper_triangle(H, H_tmp, wiener_win2, bit_depth_divider);
+}
+
+// High bit-depth Wiener statistics entry point (SVE).
+//
+// Computes M (wiener_win * wiener_win elements) and H (wiener_win2 *
+// wiener_win2 elements) for the region [h_start, h_end) x [v_start, v_end) of
+// the degraded frame dgd8 against the source frame src8. Both output matrices
+// are zeroed first. dgd_avg and src_avg are caller-provided scratch buffers
+// that receive the average-subtracted samples; they are expected to be zeroed
+// by the caller. For 10-bit and 12-bit input the results are divided by 4 and
+// 16 respectively. wiener_win must be WIENER_WIN or WIENER_WIN_CHROMA.
+void av1_compute_stats_highbd_sve(int wiener_win, const uint8_t *dgd8,
+                                  const uint8_t *src8, int16_t *dgd_avg,
+                                  int16_t *src_avg, int h_start, int h_end,
+                                  int v_start, int v_end, int dgd_stride,
+                                  int src_stride, int64_t *M, int64_t *H,
+                                  aom_bit_depth_t bit_depth) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = wiener_win >> 1;
+  const int32_t width = h_end - h_start;
+  const int32_t height = v_end - v_start;
+
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12)
+    bit_depth_divider = 16;
+  else if (bit_depth == AOM_BITS_10)
+    bit_depth_divider = 4;
+
+  const uint16_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
+
+  const uint16_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
+
+  // dgd_avg and src_avg have been memset to zero before calling this function
+  // so round up the stride to the next multiple of 8 so that we don't have to
+  // worry about a tail loop when computing M.
+  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
+  const int src_avg_stride = (width & ~7) + 8;
+
+  // Compute (dgd - avg) and store it in dgd_avg.
+  // The wiener window will slide along the dgd frame, centered on each pixel.
+  // For the top left pixel and all the pixels on the side of the frame this
+  // means half of the window will be outside of the frame. As such the actual
+  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin);
+
+  // Compute (src - avg) and store it in src_avg.
+  const uint16_t *src_start = src + h_start + v_start * src_stride;
+  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
+                  height);
+
+  if (wiener_win == WIENER_WIN) {
+    highbd_compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg,
+                                  src_avg_stride, width, height, M, H,
+                                  bit_depth_divider);
+  } else {
+    highbd_compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg,
+                                  src_avg_stride, width, height, M, H,
+                                  bit_depth_divider);
+  }
+}
diff --git a/av1/encoder/arm/neon/highbd_rdopt_neon.c b/av1/encoder/arm/highbd_rdopt_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_rdopt_neon.c rename to av1/encoder/arm/highbd_rdopt_neon.c
diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c similarity index 100% rename from av1/encoder/arm/neon/highbd_temporal_filter_neon.c rename to av1/encoder/arm/highbd_temporal_filter_neon.c
diff --git a/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/hybrid_fwd_txfm_neon.c similarity index 98% rename from av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c rename to av1/encoder/arm/hybrid_fwd_txfm_neon.c index 6cf835a..1d83bec 100644 --- a/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c +++ b/av1/encoder/arm/hybrid_fwd_txfm_neon.c
@@ -12,6 +12,7 @@ #include <arm_neon.h> #include "aom_dsp/txfm_common.h" +#include "config/av1_rtcd.h" static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) { int32x4x2_t b0 =
diff --git a/av1/encoder/arm/neon/ml_neon.c b/av1/encoder/arm/ml_neon.c similarity index 100% rename from av1/encoder/arm/neon/ml_neon.c rename to av1/encoder/arm/ml_neon.c
diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/neon/av1_error_neon.c deleted file mode 100644 index 84c8967..0000000 --- a/av1/encoder/arm/neon/av1_error_neon.c +++ /dev/null
@@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/arm/mem_neon.h" - -int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - int64x2_t error = vdupq_n_s64(0); - int64x2_t sqcoeff = vdupq_n_s64(0); - - assert(block_size >= 8); - assert((block_size % 8) == 0); - - do { - const int16x8_t c = load_tran_low_to_s16q(coeff); - const int16x8_t d = load_tran_low_to_s16q(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. 
- const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - - const int16x4_t coeff_lo = vget_low_s16(c); - const int16x4_t coeff_hi = vget_high_s16(c); - const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo); - const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi); - const int64x2_t sqcoeff2 = - vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1)); - sqcoeff = vaddq_s64(sqcoeff, sqcoeff2); - - coeff += 8; - dqcoeff += 8; - block_size -= 8; - } while (block_size != 0); - -#if AOM_ARCH_AARCH64 - *ssz = vaddvq_s64(sqcoeff); - return vaddvq_s64(error); -#else - *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); -#endif -} - -int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t block_size) { - int64x2_t error = vdupq_n_s64(0); - - assert(block_size >= 8); - assert((block_size % 8) == 0); - - do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; - } while (block_size != 0); - - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); -}
diff --git a/av1/encoder/arm/neon/highbd_pickrst_neon.c b/av1/encoder/arm/neon/highbd_pickrst_neon.c deleted file mode 100644 index 76e0344..0000000 --- a/av1/encoder/arm/neon/highbd_pickrst_neon.c +++ /dev/null
@@ -1,741 +0,0 @@ -/* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> -#include <assert.h> -#include <stdint.h> - -#include "aom_dsp/arm/mem_neon.h" -#include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" -#include "av1/encoder/arm/neon/pickrst_neon.h" -#include "av1/encoder/pickrst.h" - -static INLINE void highbd_calc_proj_params_r0_r1_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - - int64x2_t h00_lo = vdupq_n_s64(0); - int64x2_t h00_hi = vdupq_n_s64(0); - int64x2_t h11_lo = vdupq_n_s64(0); - int64x2_t h11_hi = vdupq_n_s64(0); - int64x2_t h01_lo = vdupq_n_s64(0); - int64x2_t h01_hi = vdupq_n_s64(0); - int64x2_t c0_lo = vdupq_n_s64(0); - int64x2_t c0_hi = vdupq_n_s64(0); - int64x2_t c1_lo = vdupq_n_s64(0); - int64x2_t c1_hi = vdupq_n_s64(0); - - do { - const uint16_t *src_ptr = src; - const uint16_t *dat_ptr = dat; - int32_t *flt0_ptr = flt0; - int32_t *flt1_ptr = flt1; - int w = width; - - do { - uint16x8_t s = vld1q_u16(src_ptr); - uint16x8_t d = vld1q_u16(dat_ptr); - int32x4_t f0_lo = vld1q_s32(flt0_ptr); - int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); - int32x4_t f1_lo = vld1q_s32(flt1_ptr); - int32x4_t f1_hi = 
vld1q_s32(flt1_ptr + 4); - - int32x4_t u_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); - int32x4_t u_hi = vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); - int32x4_t s_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); - int32x4_t s_hi = vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); - s_lo = vsubq_s32(s_lo, u_lo); - s_hi = vsubq_s32(s_hi, u_hi); - - f0_lo = vsubq_s32(f0_lo, u_lo); - f0_hi = vsubq_s32(f0_hi, u_hi); - f1_lo = vsubq_s32(f1_lo, u_lo); - f1_hi = vsubq_s32(f1_hi, u_hi); - - h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); - h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); - h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); - h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); - - h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); - h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); - h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); - h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); - - h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); - h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); - h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); - h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); - - c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); - c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); - c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); - c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); - - c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); - c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); - c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); - c1_hi = 
vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt0_ptr += 8; - flt1_ptr += 8; - w -= 8; - } while (w != 0); - - src += src_stride; - dat += dat_stride; - flt0 += flt0_stride; - flt1 += flt1_stride; - } while (--height != 0); - - H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; - H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; - H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; - H[1][0] = H[0][1]; - C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; - C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; -} - -static INLINE void highbd_calc_proj_params_r0_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - - int64x2_t h00_lo = vdupq_n_s64(0); - int64x2_t h00_hi = vdupq_n_s64(0); - int64x2_t c0_lo = vdupq_n_s64(0); - int64x2_t c0_hi = vdupq_n_s64(0); - - do { - const uint16_t *src_ptr = src; - const uint16_t *dat_ptr = dat; - int32_t *flt0_ptr = flt0; - int w = width; - - do { - uint16x8_t s = vld1q_u16(src_ptr); - uint16x8_t d = vld1q_u16(dat_ptr); - int32x4_t f0_lo = vld1q_s32(flt0_ptr); - int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); - - int32x4_t u_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); - int32x4_t u_hi = vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); - int32x4_t s_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); - int32x4_t s_hi = vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); - s_lo = vsubq_s32(s_lo, u_lo); - s_hi = vsubq_s32(s_hi, u_hi); - - f0_lo = vsubq_s32(f0_lo, u_lo); - f0_hi = vsubq_s32(f0_hi, u_hi); - - h00_lo = vmlal_s32(h00_lo, 
vget_low_s32(f0_lo), vget_low_s32(f0_lo)); - h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); - h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); - h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); - - c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); - c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); - c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); - c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt0_ptr += 8; - w -= 8; - } while (w != 0); - - src += src_stride; - dat += dat_stride; - flt0 += flt0_stride; - } while (--height != 0); - - H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; - C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; -} - -static INLINE void highbd_calc_proj_params_r1_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, - int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - - int64x2_t h11_lo = vdupq_n_s64(0); - int64x2_t h11_hi = vdupq_n_s64(0); - int64x2_t c1_lo = vdupq_n_s64(0); - int64x2_t c1_hi = vdupq_n_s64(0); - - do { - const uint16_t *src_ptr = src; - const uint16_t *dat_ptr = dat; - int32_t *flt1_ptr = flt1; - int w = width; - - do { - uint16x8_t s = vld1q_u16(src_ptr); - uint16x8_t d = vld1q_u16(dat_ptr); - int32x4_t f1_lo = vld1q_s32(flt1_ptr); - int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); - - int32x4_t u_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); - int32x4_t u_hi = vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); - int32x4_t s_lo = - vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); - int32x4_t s_hi = 
vreinterpretq_s32_u32( - vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); - s_lo = vsubq_s32(s_lo, u_lo); - s_hi = vsubq_s32(s_hi, u_hi); - - f1_lo = vsubq_s32(f1_lo, u_lo); - f1_hi = vsubq_s32(f1_hi, u_hi); - - h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); - h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); - h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); - h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); - - c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); - c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); - c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); - c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt1_ptr += 8; - w -= 8; - } while (w != 0); - - src += src_stride; - dat += dat_stride; - flt1 += flt1_stride; - } while (--height != 0); - - H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; - C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; -} - -// The function calls 3 subfunctions for the following cases : -// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements -// of C and H need to be computed. -// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are -// non-zero and need to be computed. -// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are -// non-zero and need to be computed. 
-void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, int dat_stride, - int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, - int64_t H[2][2], int64_t C[2], - const sgr_params_type *params) { - if ((params->r[0] > 0) && (params->r[1] > 0)) { - highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, - dat_stride, flt0, flt0_stride, flt1, - flt1_stride, H, C); - } else if (params->r[0] > 0) { - highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, - dat_stride, flt0, flt0_stride, H, C); - } else if (params->r[1] > 0) { - highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, - dat_stride, flt1, flt1_stride, H, C); - } -} - -static int16_t highbd_find_average_neon(const int16_t *src, int src_stride, - int width, int height) { - assert(width > 0); - assert(height > 0); - - int64x2_t sum_s64 = vdupq_n_s64(0); - int64_t sum = 0; - - int h = height; - do { - int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; - - int w = width; - const int16_t *row = src; - while (w >= 32) { - int16x8_t s0 = vld1q_s16(row + 0); - int16x8_t s1 = vld1q_s16(row + 8); - int16x8_t s2 = vld1q_s16(row + 16); - int16x8_t s3 = vld1q_s16(row + 24); - - s0 = vaddq_s16(s0, s1); - s2 = vaddq_s16(s2, s3); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); - sum_s32[1] = vpadalq_s16(sum_s32[1], s2); - - row += 32; - w -= 32; - } - - if (w >= 16) { - int16x8_t s0 = vld1q_s16(row + 0); - int16x8_t s1 = vld1q_s16(row + 8); - - s0 = vaddq_s16(s0, s1); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); - - row += 16; - w -= 16; - } - - if (w >= 8) { - int16x8_t s0 = vld1q_s16(row); - sum_s32[1] = vpadalq_s16(sum_s32[1], s0); - - row += 8; - w -= 8; - } - - if (w >= 4) { - int16x8_t s0 = vcombine_s16(vld1_s16(row), vdup_n_s16(0)); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); - - row += 4; - w -= 4; - } - - while (w-- > 0) { - sum += *row++; - } - - sum_s64 = vpadalq_s32(sum_s64, 
vaddq_s32(sum_s32[0], sum_s32[1])); - - src += src_stride; - } while (--h != 0); - return (int16_t)((horizontal_add_s64x2(sum_s64) + sum) / (height * width)); -} - -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, - const int wiener_win, - const int wiener_win2) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 7 * 7. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 49 * 49. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. 
-static INLINE void highbd_compute_stats_win7_neon( - const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride, - int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) { - const int wiener_win = 7; - const int wiener_win2 = wiener_win * wiener_win; - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[49]; - memset(M_trn, 0, sizeof(M_trn)); - - int16x8_t vavg = vdupq_n_s16(avg); - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = - vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg); - int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src[j] - avg; - int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg; - int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg; - int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg; - int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg; - int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg; - int16_t d5 = dgd[row * dgd_stride + 5 + j] - avg; - int16_t d6 = dgd[row * dgd_stride + 6 + j] - avg; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; - M_trn[row * wiener_win + 5] += d5 * s; - M_trn[row * wiener_win + 6] += d6 * s; - - j++; - } - } - - // Auto-covariance (H). 
- int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - int16x8_t dgd0[7]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg); - dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. - int16x8_t dgd1[7]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg); - dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - if (j < width) { - // Process remaining columns using a mask to discard excess elements. 
- for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. 
- int16x8_t dgd1[7]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg); - dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd += dgd_stride; - src += src_stride; - } while (--height != 0); - - // Transpose M_trn. - transpose_M_win7(M, M_trn, 7); - - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); - - // Scaling the results. - uint8_t bit_depth_divider = 1; - if (bit_depth == AOM_BITS_12) { - bit_depth_divider = 16; - } else if (bit_depth == AOM_BITS_10) { - bit_depth_divider = 4; - } - - for (int i = 0; i < wiener_win2; ++i) { - M[i] /= bit_depth_divider; - for (int j = 0; j < wiener_win2; ++j) { - H[i * wiener_win2 + j] /= bit_depth_divider; - } - } -} - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 5 * 5. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 25 * 25. 
It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void highbd_compute_stats_win5_neon( - const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride, - int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) { - const int wiener_win = 5; - const int wiener_win2 = wiener_win * wiener_win; - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[25]; - memset(M_trn, 0, sizeof(M_trn)); - - int16x8_t vavg = vdupq_n_s16(avg); - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = - vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg); - int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg); - - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. 
- while (j < width) { - int16_t s = src[j] - avg; - int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg; - int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg; - int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg; - int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg; - int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. 
- int16x8_t dgd1[5]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - if (j < width) { - // Process remaining columns using a mask to discard excess elements. - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. 
- int16x8_t dgd1[5]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd += dgd_stride; - src += src_stride; - } while (--height != 0); - - // Transpose M_trn. - transpose_M_win5(M, M_trn, 5); - - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); - - // Scaling the results. - uint8_t bit_depth_divider = 1; - if (bit_depth == AOM_BITS_12) { - bit_depth_divider = 16; - } else if (bit_depth == AOM_BITS_10) { - bit_depth_divider = 4; - } - - for (int i = 0; i < wiener_win2; ++i) { - M[i] /= bit_depth_divider; - for (int j = 0; j < wiener_win2; ++j) { - H[i * wiener_win2 + j] /= bit_depth_divider; - } - } -} - -void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { - assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); - - const int wiener_halfwin = wiener_win >> 1; - const int wiener_win2 = wiener_win * wiener_win; - memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); - - const int16_t *src = (const int16_t *)CONVERT_TO_SHORTPTR(src8); - const int16_t *dgd = (const int16_t *)CONVERT_TO_SHORTPTR(dgd8); - const int height = v_end - v_start; - const int width = h_end - h_start; - const int vert_offset = v_start - wiener_halfwin; - const int horiz_offset = h_start - wiener_halfwin; - - int16_t avg = highbd_find_average_neon(dgd + v_start * dgd_stride + h_start, 
- dgd_stride, width, height); - - src += v_start * src_stride + h_start; - dgd += vert_offset * dgd_stride + horiz_offset; - - if (wiener_win == WIENER_WIN) { - highbd_compute_stats_win7_neon(dgd, dgd_stride, src, src_stride, width, - height, M, H, avg, bit_depth); - } else { - highbd_compute_stats_win5_neon(dgd, dgd_stride, src, src_stride, width, - height, M, H, avg, bit_depth); - } -}
diff --git a/av1/encoder/arm/neon/pickrst_neon.c b/av1/encoder/arm/neon/pickrst_neon.c deleted file mode 100644 index 6227028..0000000 --- a/av1/encoder/arm/neon/pickrst_neon.c +++ /dev/null
@@ -1,1261 +0,0 @@ -/* - * Copyright (c) 2020, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" -#include "av1/common/restoration.h" -#include "av1/encoder/arm/neon/pickrst_neon.h" -#include "av1/encoder/pickrst.h" - -int64_t av1_lowbd_pixel_proj_error_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { - int i, j, k; - const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; - const int32x4_t zero = vdupq_n_s32(0); - uint64x2_t sum64 = vreinterpretq_u64_s32(zero); - const uint8_t *src = src8; - const uint8_t *dat = dat8; - - int64_t err = 0; - if (params->r[0] > 0 && params->r[1] > 0) { - for (i = 0; i < height; ++i) { - int32x4_t err0 = zero; - for (j = 0; j <= width - 8; j += 8) { - const uint8x8_t d0 = vld1_u8(&dat[j]); - const uint8x8_t s0 = vld1_u8(&src[j]); - const int16x8_t flt0_16b = - vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])), - vqmovn_s32(vld1q_s32(&flt0[j + 4]))); - const int16x8_t flt1_16b = - vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])), - vqmovn_s32(vld1q_s32(&flt1[j + 4]))); - const int16x8_t u0 = - vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS)); - const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0); - const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, 
u0); - const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u); - const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u); - const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u); - const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u); - - int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]); - v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]); - int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]); - v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]); - const int16x4_t vr0 = vqrshrn_n_s32(v0, 11); - const int16x4_t vr1 = vqrshrn_n_s32(v1, 11); - const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1), - vreinterpretq_s16_u16(vsubl_u8(d0, s0))); - const int16x4_t e0_lo = vget_low_s16(e0); - const int16x4_t e0_hi = vget_high_s16(e0); - err0 = vmlal_s16(err0, e0_lo, e0_lo); - err0 = vmlal_s16(err0, e0_hi, e0_hi); - } - for (k = j; k < width; ++k) { - const int32_t u = dat[k] << SGRPROJ_RST_BITS; - int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); - const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k]; - err += e * e; - } - dat += dat_stride; - src += src_stride; - flt0 += flt0_stride; - flt1 += flt1_stride; - sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0)); - } - - } else if (params->r[0] > 0 || params->r[1] > 0) { - const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; - const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; - const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; - for (i = 0; i < height; ++i) { - int32x4_t err0 = zero; - for (j = 0; j <= width - 8; j += 8) { - const uint8x8_t d0 = vld1_u8(&dat[j]); - const uint8x8_t s0 = vld1_u8(&src[j]); - const uint16x8_t d0s0 = vsubl_u8(d0, s0); - const uint16x8x2_t d0w = - vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero)); - - const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]); - const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]); - - int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active); - v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]), - xq_active * (1 << SGRPROJ_RST_BITS)); - int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active); - v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]), - xq_active * (1 << SGRPROJ_RST_BITS)); - const int16x4_t vr0 = vqrshrn_n_s32(v0, 11); - const int16x4_t vr1 = vqrshrn_n_s32(v1, 11); - const int16x8_t e0 = - vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0)); - const int16x4_t e0_lo = vget_low_s16(e0); - const int16x4_t e0_hi = vget_high_s16(e0); - err0 = vmlal_s16(err0, e0_lo, e0_lo); - err0 = vmlal_s16(err0, e0_hi, e0_hi); - } - for (k = j; k < width; ++k) { - const int32_t u = dat[k] << SGRPROJ_RST_BITS; - int32_t v = xq_active * (flt[k] - u); - const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; - } - dat += dat_stride; - src += src_stride; - flt += flt_stride; - sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0)); - } - } else { - uint32x4_t err0 = vreinterpretq_u32_s32(zero); - for (i = 0; i < height; ++i) { - for (j = 0; j <= width - 16; j += 16) { - const uint8x16_t d = vld1q_u8(&dat[j]); - const uint8x16_t s = vld1q_u8(&src[j]); - const uint8x16_t diff = vabdq_u8(d, s); - const uint8x8_t diff0 = vget_low_u8(diff); - const uint8x8_t diff1 = vget_high_u8(diff); - err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0)); - err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1)); - } - for (k = j; k < width; ++k) { - const int32_t e = dat[k] - src[k]; - err += e * e; - } - dat += 
dat_stride; - src += src_stride; - } - sum64 = vpaddlq_u32(err0); - } -#if AOM_ARCH_AARCH64 - err += vaddvq_u64(sum64); -#else - err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0); -#endif // AOM_ARCH_AARCH64 - return err; -} - -static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride, - int width, int height) { - uint64_t sum = 0; - - if (width >= 16) { - int h = 0; - // We can accumulate up to 257 8-bit values in a 16-bit value, given - // that each 16-bit vector has 8 elements, that means we can process up to - // int(257*8/width) rows before we need to widen to 32-bit vector - // elements. - int h_overflow = 257 * 8 / width; - int h_limit = height > h_overflow ? h_overflow : height; - uint32x4_t avg_u32 = vdupq_n_u32(0); - do { - uint16x8_t avg_u16 = vdupq_n_u16(0); - do { - int j = width; - const uint8_t *src_ptr = src; - do { - uint8x16_t s = vld1q_u8(src_ptr); - avg_u16 = vpadalq_u8(avg_u16, s); - j -= 16; - src_ptr += 16; - } while (j >= 16); - if (j >= 8) { - uint8x8_t s = vld1_u8(src_ptr); - avg_u16 = vaddw_u8(avg_u16, s); - j -= 8; - src_ptr += 8; - } - // Scalar tail case. - while (j > 0) { - sum += src[width - j]; - j--; - } - src += src_stride; - } while (++h < h_limit); - avg_u32 = vpadalq_u16(avg_u32, avg_u16); - - h_limit += h_overflow; - h_limit = height > h_overflow ? h_overflow : height; - } while (h < height); - return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) / - (width * height)); - } - if (width >= 8) { - int h = 0; - // We can accumulate up to 257 8-bit values in a 16-bit value, given - // that each 16-bit vector has 4 elements, that means we can process up to - // int(257*4/width) rows before we need to widen to 32-bit vector - // elements. - int h_overflow = 257 * 4 / width; - int h_limit = height > h_overflow ? 
h_overflow : height; - uint32x2_t avg_u32 = vdup_n_u32(0); - do { - uint16x4_t avg_u16 = vdup_n_u16(0); - do { - int j = width; - const uint8_t *src_ptr = src; - uint8x8_t s = vld1_u8(src_ptr); - avg_u16 = vpadal_u8(avg_u16, s); - j -= 8; - src_ptr += 8; - // Scalar tail case. - while (j > 0) { - sum += src[width - j]; - j--; - } - src += src_stride; - } while (++h < h_limit); - avg_u32 = vpadal_u16(avg_u32, avg_u16); - - h_limit += h_overflow; - h_limit = height > h_overflow ? h_overflow : height; - } while (h < height); - return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) / - (width * height)); - } - int i = height; - do { - int j = 0; - do { - sum += src[j]; - } while (++j < width); - src += src_stride; - } while (--i != 0); - return (uint8_t)(sum / (width * height)); -} - -static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, - int16_t *buf_avg, int buf_avg_stride, - int width, int height, - int downsample_factor) { - uint8x8_t avg_u8 = vdup_n_u8(avg); - - if (width > 8) { - int i = 0; - do { - int j = width; - const uint8_t *buf_ptr = buf; - int16_t *buf_avg_ptr = buf_avg; - do { - uint8x8_t d = vld1_u8(buf_ptr); - vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); - - j -= 8; - buf_ptr += 8; - buf_avg_ptr += 8; - } while (j >= 8); - while (j > 0) { - *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg; - buf_avg_ptr++; - j--; - } - buf += buf_stride; - buf_avg += buf_avg_stride; - i += downsample_factor; - } while (i < height); - } else { - // For width < 8, don't use Neon. 
- for (int i = 0; i < height; i = i + downsample_factor) { - for (int j = 0; j < width; j++) { - buf_avg[j] = (int16_t)buf[j] - (int16_t)avg; - } - buf += buf_stride; - buf_avg += buf_avg_stride; - } - } -} - -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, - const int wiener_win, - const int wiener_win2, int32x4_t df_s32) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - auto_cov = vshlq_s32(auto_cov, df_s32); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -static INLINE void compute_H_one_col_last_row(int16x8_t *dgd, int col, - int64_t *H, const int wiener_win, - const int wiener_win2, - int last_row_df) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - auto_cov = vmulq_n_s32(auto_cov, last_row_df); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -// When we load 8 values of int16_t type and need less than 8 values for -// processing, the below mask is used to make the extra values zero. -const int16_t av1_neon_mask_16bit[16] = { - -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 7 * 7. 
It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 49 * 49. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void compute_stats_win7_neon(int16_t *dgd_avg, int dgd_avg_stride, - int16_t *src_avg, int src_avg_stride, - int width, int v_start, int v_end, - int64_t *M, int64_t *H, - int downsample_factor, - int last_row_downsample_factor) { - const int wiener_win = 7; - const int wiener_win2 = wiener_win * wiener_win; - // The downsample factor can be either 1 or 4, so instead of multiplying the - // values by 1 or 4, we can left shift by 0 or 2 respectively, which is - // faster. (This doesn't apply to the last row where we can scale the values - // by 1, 2 or 3, so we keep the multiplication). - const int downsample_shift = downsample_factor >> 1; - const int16x8_t df_s16 = vdupq_n_s16(downsample_shift); - const int32x4_t df_s32 = vdupq_n_s32(downsample_shift); - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[49]; - memset(M_trn, 0, sizeof(M_trn)); - - int h = v_start; - do { - // Cross-correlation (M). 
- for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src and scale based on downsampling factor. - int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src_avg[j] * downsample_factor; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j]; - int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j]; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; - M_trn[row * wiener_win + 5] += d5 * s; - M_trn[row * wiener_win + 6] += d6 * s; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (28 elements). 
- // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vshlq_s16(dgd1[5], df_s16); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vshlq_s16(dgd1[6], df_s16); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - if (j < width) { - // Process remaining columns using a mask to discard excess elements. - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. 
- int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vshlq_s16(dgd1[5], df_s16); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vshlq_s16(dgd1[6], df_s16); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd_avg += downsample_factor * dgd_avg_stride; - src_avg += src_avg_stride; - h += downsample_factor; - } while (h <= v_end - downsample_factor); - - if (h < v_end) { - // The last row is scaled by a different downsample factor, so process - // separately. - - // Cross-correlation (M). - for (int row = 0; row < 7; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = - vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. 
- while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j]; - int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 5] += d5 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 6] += d6 * s * last_row_downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - int col0 = 0; - do { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. 
- for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } while (++col0 < wiener_win); - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - int col0 = 0; - do { - // Load first column. 
- int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } while (++col0 < wiener_win); - } - } - - // Transpose M_trn. - transpose_M_win7(M, M_trn, 7); - - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); -} - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 5 * 5. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 25 * 25. 
It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void compute_stats_win5_neon(int16_t *dgd_avg, int dgd_avg_stride, - int16_t *src_avg, int src_avg_stride, - int width, int v_start, int v_end, - int64_t *M, int64_t *H, - int downsample_factor, - int last_row_downsample_factor) { - const int wiener_win = 5; - const int wiener_win2 = wiener_win * wiener_win; - // The downsample factor can be either 1 or 4, so instead of multiplying the - // values by 1 or 4, we can left shift by 0 or 2 respectively, which is - // faster. (This doesn't apply to the last row where we can scale the values - // by 1, 2 or 3, so we keep the multiplication). - const int downsample_shift = downsample_factor >> 1; - const int16x8_t df_s16 = vdupq_n_s16(downsample_shift); - const int32x4_t df_s32 = vdupq_n_s32(downsample_shift); - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[25]; - memset(M_trn, 0, sizeof(M_trn)); - - int h = v_start; - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16); - - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - - // Process remaining elements without Neon. 
- while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd_avg += downsample_factor * dgd_avg_stride; - src_avg += src_avg_stride; - h += downsample_factor; - } while (h <= v_end - downsample_factor); - - if (h < v_end) { - // The last row is scaled by a different downsample factor, so process - // separately. - - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = - vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor); - - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - - // Process remaining elements without Neon. 
- while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. 
- compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - } - - // Transpose M_trn. - transpose_M_win5(M, M_trn, 5); - - // Copy upper triangle of H in the lower one. 
- copy_upper_triangle(H, wiener_win2); -} - -void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int16_t *dgd_avg, - int16_t *src_avg, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - int use_downsampled_wiener_stats) { - assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); - - const int wiener_win2 = wiener_win * wiener_win; - const int wiener_halfwin = wiener_win >> 1; - const int32_t width = h_end - h_start; - const int32_t height = v_end - v_start; - const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; - memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); - - uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); - assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); - int downsample_factor = - use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; - - int dgd_avg_stride = width + 2 * wiener_halfwin; - int src_avg_stride = width; - - // Compute (dgd - avg) and store it in dgd_avg. - // The wiener window will slide along the dgd frame, centered on each pixel. - // For the top left pixel and all the pixels on the side of the frame this - // means half of the window will be outside of the frame. As such the actual - // buffer that we need to subtract the avg from will be 2 * wiener_halfwin - // wider and 2 * wiener_halfwin higher than the original dgd buffer. - const int vert_offset = v_start - wiener_halfwin; - const int horiz_offset = h_start - wiener_halfwin; - const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; - compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, - width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); - - // Compute (src - avg), downsample if necessary and store in src-avg. 
- const uint8_t *src_start = src + h_start + v_start * src_stride; - compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, - src_avg_stride, width, height, downsample_factor); - - // Since the height is not necessarily a multiple of the downsample factor, - // the last line of src will be scaled according to how many rows remain. - int last_row_downsample_factor = - use_downsampled_wiener_stats ? height % downsample_factor : 1; - - if (wiener_win == WIENER_WIN) { - compute_stats_win7_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, v_start, v_end, M, H, downsample_factor, - last_row_downsample_factor); - } else { - compute_stats_win5_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, v_start, v_end, M, H, downsample_factor, - last_row_downsample_factor); - } -} - -static INLINE void calc_proj_params_r0_r1_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - - int64x2_t h00_lo = vdupq_n_s64(0); - int64x2_t h00_hi = vdupq_n_s64(0); - int64x2_t h11_lo = vdupq_n_s64(0); - int64x2_t h11_hi = vdupq_n_s64(0); - int64x2_t h01_lo = vdupq_n_s64(0); - int64x2_t h01_hi = vdupq_n_s64(0); - int64x2_t c0_lo = vdupq_n_s64(0); - int64x2_t c0_hi = vdupq_n_s64(0); - int64x2_t c1_lo = vdupq_n_s64(0); - int64x2_t c1_hi = vdupq_n_s64(0); - - do { - const uint8_t *src_ptr = src8; - const uint8_t *dat_ptr = dat8; - int32_t *flt0_ptr = flt0; - int32_t *flt1_ptr = flt1; - int w = width; - - do { - uint8x8_t s = vld1_u8(src_ptr); - uint8x8_t d = vld1_u8(dat_ptr); - int32x4_t f0_lo = vld1q_s32(flt0_ptr); - int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); - int32x4_t f1_lo = vld1q_s32(flt1_ptr); - int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); - - int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); - int16x8_t s_s16 = 
vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); - - int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); - int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); - f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); - f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); - f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); - f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); - - h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); - h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); - h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); - h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); - - h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); - h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); - h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); - h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); - - h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); - h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); - h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); - h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); - - c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); - c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); - c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); - c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); - - c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); - c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); - c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); - c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt0_ptr += 8; - flt1_ptr += 8; - w -= 8; - } while (w != 0); - - src8 += src_stride; - dat8 += dat_stride; - flt0 += flt0_stride; - flt1 
+= flt1_stride; - } while (--height != 0); - - H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; - H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; - H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; - H[1][0] = H[0][1]; - C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; - C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; -} - -static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, int dat_stride, - int32_t *flt0, int flt0_stride, - int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - - int64x2_t h00_lo = vdupq_n_s64(0); - int64x2_t h00_hi = vdupq_n_s64(0); - int64x2_t c0_lo = vdupq_n_s64(0); - int64x2_t c0_hi = vdupq_n_s64(0); - - do { - const uint8_t *src_ptr = src8; - const uint8_t *dat_ptr = dat8; - int32_t *flt0_ptr = flt0; - int w = width; - - do { - uint8x8_t s = vld1_u8(src_ptr); - uint8x8_t d = vld1_u8(dat_ptr); - int32x4_t f0_lo = vld1q_s32(flt0_ptr); - int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); - - int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); - int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); - - int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); - int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); - f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); - f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); - - h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); - h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); - h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); - h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); - - c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); - c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); - c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); - 
c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt0_ptr += 8; - w -= 8; - } while (w != 0); - - src8 += src_stride; - dat8 += dat_stride; - flt0 += flt0_stride; - } while (--height != 0); - - H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; - C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; -} - -static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width, - int height, int src_stride, - const uint8_t *dat8, int dat_stride, - int32_t *flt1, int flt1_stride, - int64_t H[2][2], int64_t C[2]) { - assert(width % 8 == 0); - const int size = width * height; - - int64x2_t h11_lo = vdupq_n_s64(0); - int64x2_t h11_hi = vdupq_n_s64(0); - int64x2_t c1_lo = vdupq_n_s64(0); - int64x2_t c1_hi = vdupq_n_s64(0); - - do { - const uint8_t *src_ptr = src8; - const uint8_t *dat_ptr = dat8; - int32_t *flt1_ptr = flt1; - int w = width; - - do { - uint8x8_t s = vld1_u8(src_ptr); - uint8x8_t d = vld1_u8(dat_ptr); - int32x4_t f1_lo = vld1q_s32(flt1_ptr); - int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); - - int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); - int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); - - int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); - int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); - f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); - f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); - - h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); - h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); - h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); - h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); - - c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); - c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); - c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); - c1_hi = 
vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); - - src_ptr += 8; - dat_ptr += 8; - flt1_ptr += 8; - w -= 8; - } while (w != 0); - - src8 += src_stride; - dat8 += dat_stride; - flt1 += flt1_stride; - } while (--height != 0); - - H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; - C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; -} - -// The function calls 3 subfunctions for the following cases : -// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements -// of C and H need to be computed. -// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are -// non-zero and need to be computed. -// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are -// non-zero and need to be computed. -void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height, - int src_stride, const uint8_t *dat8, - int dat_stride, int32_t *flt0, int flt0_stride, - int32_t *flt1, int flt1_stride, int64_t H[2][2], - int64_t C[2], const sgr_params_type *params) { - if ((params->r[0] > 0) && (params->r[1] > 0)) { - calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, - dat_stride, flt0, flt0_stride, flt1, - flt1_stride, H, C); - } else if (params->r[0] > 0) { - calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride, - flt0, flt0_stride, H, C); - } else if (params->r[1] > 0) { - calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride, - flt1, flt1_stride, H, C); - } -}
diff --git a/av1/encoder/arm/neon/pickrst_neon.h b/av1/encoder/arm/neon/pickrst_neon.h deleted file mode 100644 index d9a9ad4..0000000 --- a/av1/encoder/arm/neon/pickrst_neon.h +++ /dev/null
@@ -1,281 +0,0 @@ -/* - * Copyright (c) 2023, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ - -#include <arm_neon.h> - -#include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" - -// When we load 8 values of int16_t type and need less than 8 values for -// processing, the below mask is used to make the extra values zero. -extern const int16_t av1_neon_mask_16bit[16]; - -static INLINE void copy_upper_triangle(int64_t *H, const int wiener_win2) { - for (int i = 0; i < wiener_win2 - 2; i = i + 2) { - // Transpose the first 2x2 square. It needs a special case as the element - // of the bottom left is on the diagonal. - int64x2_t row0 = vld1q_s64(H + i * wiener_win2 + i + 1); - int64x2_t row1 = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1); - - int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); - - vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); - vst1q_s64(H + (i + 2) * wiener_win2 + i, tr_row); - - // Transpose and store all the remaining 2x2 squares of the line. 
- for (int j = i + 3; j < wiener_win2; j = j + 2) { - row0 = vld1q_s64(H + i * wiener_win2 + j); - row1 = vld1q_s64(H + (i + 1) * wiener_win2 + j); - - int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); - int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); - - vst1q_s64(H + j * wiener_win2 + i, tr_row0); - vst1q_s64(H + (j + 1) * wiener_win2 + i, tr_row1); - } - } -} - -static INLINE void transpose_M_win5(int64_t *M, int64_t *M_trn, - const int wiener_win) { - // 1st and 2nd rows. - int64x2_t row00 = vld1q_s64(M_trn); - int64x2_t row10 = vld1q_s64(M_trn + wiener_win); - vst1q_s64(M, aom_vtrn1q_s64(row00, row10)); - vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10)); - - int64x2_t row02 = vld1q_s64(M_trn + 2); - int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2); - vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12)); - vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12)); - - // Last column only needs trn2. - int64x2_t row03 = vld1q_s64(M_trn + 3); - int64x2_t row13 = vld1q_s64(M_trn + wiener_win + 3); - vst1q_s64(M + 4 * wiener_win, aom_vtrn2q_s64(row03, row13)); - - // 3rd and 4th rows. - int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win); - int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win); - vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30)); - vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30)); - - int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2); - int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32)); - vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32)); - - // Last column only needs trn2. - int64x2_t row23 = vld1q_s64(M_trn + 2 * wiener_win + 3); - int64x2_t row33 = vld1q_s64(M_trn + 3 * wiener_win + 3); - vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn2q_s64(row23, row33)); - - // Last row. 
- int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win); - vst1_s64(M + 4, vget_low_s64(row40)); - vst1_s64(M + 1 * wiener_win + 4, vget_high_s64(row40)); - - int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2); - vst1_s64(M + 2 * wiener_win + 4, vget_low_s64(row42)); - vst1_s64(M + 3 * wiener_win + 4, vget_high_s64(row42)); - - // Element on the bottom right of M_trn is copied as is. - vst1_s64(M + 4 * wiener_win + 4, vld1_s64(M_trn + 4 * wiener_win + 4)); -} - -static INLINE void transpose_M_win7(int64_t *M, int64_t *M_trn, - const int wiener_win) { - // 1st and 2nd rows. - int64x2_t row00 = vld1q_s64(M_trn); - int64x2_t row10 = vld1q_s64(M_trn + wiener_win); - vst1q_s64(M, aom_vtrn1q_s64(row00, row10)); - vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10)); - - int64x2_t row02 = vld1q_s64(M_trn + 2); - int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2); - vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12)); - vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12)); - - int64x2_t row04 = vld1q_s64(M_trn + 4); - int64x2_t row14 = vld1q_s64(M_trn + wiener_win + 4); - vst1q_s64(M + 4 * wiener_win, aom_vtrn1q_s64(row04, row14)); - vst1q_s64(M + 5 * wiener_win, aom_vtrn2q_s64(row04, row14)); - - // Last column only needs trn2. - int64x2_t row05 = vld1q_s64(M_trn + 5); - int64x2_t row15 = vld1q_s64(M_trn + wiener_win + 5); - vst1q_s64(M + 6 * wiener_win, aom_vtrn2q_s64(row05, row15)); - - // 3rd and 4th rows. 
- int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win); - int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win); - vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30)); - vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30)); - - int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2); - int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32)); - vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32)); - - int64x2_t row24 = vld1q_s64(M_trn + 2 * wiener_win + 4); - int64x2_t row34 = vld1q_s64(M_trn + 3 * wiener_win + 4); - vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn1q_s64(row24, row34)); - vst1q_s64(M + 5 * wiener_win + 2, aom_vtrn2q_s64(row24, row34)); - - // Last column only needs trn2. - int64x2_t row25 = vld1q_s64(M_trn + 2 * wiener_win + 5); - int64x2_t row35 = vld1q_s64(M_trn + 3 * wiener_win + 5); - vst1q_s64(M + 6 * wiener_win + 2, aom_vtrn2q_s64(row25, row35)); - - // 5th and 6th rows. - int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win); - int64x2_t row50 = vld1q_s64(M_trn + 5 * wiener_win); - vst1q_s64(M + 4, aom_vtrn1q_s64(row40, row50)); - vst1q_s64(M + wiener_win + 4, aom_vtrn2q_s64(row40, row50)); - - int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2); - int64x2_t row52 = vld1q_s64(M_trn + 5 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 4, aom_vtrn1q_s64(row42, row52)); - vst1q_s64(M + 3 * wiener_win + 4, aom_vtrn2q_s64(row42, row52)); - - int64x2_t row44 = vld1q_s64(M_trn + 4 * wiener_win + 4); - int64x2_t row54 = vld1q_s64(M_trn + 5 * wiener_win + 4); - vst1q_s64(M + 4 * wiener_win + 4, aom_vtrn1q_s64(row44, row54)); - vst1q_s64(M + 5 * wiener_win + 4, aom_vtrn2q_s64(row44, row54)); - - // Last column only needs trn2. - int64x2_t row45 = vld1q_s64(M_trn + 4 * wiener_win + 5); - int64x2_t row55 = vld1q_s64(M_trn + 5 * wiener_win + 5); - vst1q_s64(M + 6 * wiener_win + 4, aom_vtrn2q_s64(row45, row55)); - - // Last row. 
- int64x2_t row60 = vld1q_s64(M_trn + 6 * wiener_win); - vst1_s64(M + 6, vget_low_s64(row60)); - vst1_s64(M + 1 * wiener_win + 6, vget_high_s64(row60)); - - int64x2_t row62 = vld1q_s64(M_trn + 6 * wiener_win + 2); - vst1_s64(M + 2 * wiener_win + 6, vget_low_s64(row62)); - vst1_s64(M + 3 * wiener_win + 6, vget_high_s64(row62)); - - int64x2_t row64 = vld1q_s64(M_trn + 6 * wiener_win + 4); - vst1_s64(M + 4 * wiener_win + 6, vget_low_s64(row64)); - vst1_s64(M + 5 * wiener_win + 6, vget_high_s64(row64)); - - // Element on the bottom right of M_trn is copied as is. - vst1_s64(M + 6 * wiener_win + 6, vld1_s64(M_trn + 6 * wiener_win + 6)); -} - -static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd0, - int16x8_t dgd1, int64_t *M, - const int wiener_win, int row) { - int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0); - int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2); - - int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0)); - m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0)); - - int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1); - int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01)); - m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01)); - - m0 = horizontal_add_2d_s32(m0, m1); - m_01 = vpadalq_s32(m_01, m0); - vst1q_s64(M + row * wiener_win + 0, m_01); - - int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2); - int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02)); - m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02)); - - int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3); - int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03)); - m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03)); - - m2 = horizontal_add_2d_s32(m2, m3); - m_23 = vpadalq_s32(m_23, m2); - vst1q_s64(M + row * wiener_win + 2, m_23); - - int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4); - int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04)); - m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04)); - M[row * 
wiener_win + 4] += horizontal_long_add_s32x4(m4); -} - -static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd0, - int16x8_t dgd1, int64_t *M, - const int wiener_win, int row) { - int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0); - int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2); - int64x2_t m_45 = vld1q_s64(M + row * wiener_win + 4); - - int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0)); - m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0)); - - int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1); - int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01)); - m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01)); - - m0 = horizontal_add_2d_s32(m0, m1); - m_01 = vpadalq_s32(m_01, m0); - vst1q_s64(M + row * wiener_win + 0, m_01); - - int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2); - int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02)); - m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02)); - - int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3); - int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03)); - m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03)); - - m2 = horizontal_add_2d_s32(m2, m3); - m_23 = vpadalq_s32(m_23, m2); - vst1q_s64(M + row * wiener_win + 2, m_23); - - int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4); - int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04)); - m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04)); - - int16x8_t dgd05 = vextq_s16(dgd0, dgd1, 5); - int32x4_t m5 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd05)); - m5 = vmlal_s16(m5, vget_high_s16(src), vget_high_s16(dgd05)); - - m4 = horizontal_add_2d_s32(m4, m5); - m_45 = vpadalq_s32(m_45, m4); - vst1q_s64(M + row * wiener_win + 4, m_45); - - int16x8_t dgd06 = vextq_s16(dgd0, dgd1, 6); - int32x4_t m6 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd06)); - m6 = vmlal_s16(m6, vget_high_s16(src), vget_high_s16(dgd06)); - M[row * wiener_win + 6] += horizontal_long_add_s32x4(m6); -} - -static 
INLINE void compute_H_two_cols(int16x8_t *dgd0, int16x8_t *dgd1, - int col0, int col1, int64_t *H, - const int wiener_win, - const int wiener_win2) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = 0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col0 * wiener_win + row0) * wiener_win2 + (col1 * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd0[row0]), vget_low_s16(dgd1[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd0[row0]), - vget_high_s16(dgd1[row1])); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c new file mode 100644 index 0000000..85b980c --- /dev/null +++ b/av1/encoder/arm/pickrst_neon.c
@@ -0,0 +1,1217 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/arm/pickrst_neon.h" +#include "av1/encoder/pickrst.h" + +int64_t av1_lowbd_pixel_proj_error_neon( + const uint8_t *src, int width, int height, int src_stride, + const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int64_t sse = 0; + int64x2_t sse_s64 = vdupq_n_s64(0); + + if (params->r[0] > 0 && params->r[1] > 0) { + int32x2_t xq_v = vld1_s32(xq); + int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS); + + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt0[j]); + int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); + int32x4_t flt1_0 = vld1q_s32(&flt1[j]); + int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0); + + v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); + v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); + + int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d)); + v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16), + 
vreinterpret_s16_s32(xq_sum_v), 0); + v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16), + vreinterpret_s16_s32(xq_sum_v), 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = (dat[k] << SGRPROJ_RST_BITS); + int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) + + xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]); + int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += e * e; + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + } else if (params->r[0] > 0 || params->r[1] > 0) { + int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + int32x2_t xq_v = vdup_n_s32(xq_active); + + do { + int32x4_t sse_s32 = vdupq_n_s32(0); + int j = 0; + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt_0 = vld1q_s32(&flt[j]); + int32x4_t flt_1 = vld1q_s32(&flt[j + 4]); + int16x8_t d_s16 = + vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + + int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16)); + int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16)); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = dat[k] << SGRPROJ_RST_BITS; + int32_t v = xq_active * (flt[k] - u); + int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + + dat[k] - src[k]; + sse += e * e; + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } while (--height != 0); + } else { + uint32x4_t sse_s32 = vdupq_n_u32(0); + + do { + int j = 0; + + do { + const uint8x16_t d = vld1q_u8(&dat[j]); + const uint8x16_t s = vld1q_u8(&src[j]); + + uint8x16_t diff = vabdq_u8(d, s); + uint8x8_t diff_lo = vget_low_u8(diff); + uint8x8_t diff_hi = vget_high_u8(diff); + + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo)); + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi)); + + j += 16; + } while (j <= width - 16); + + for 
(int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; + } + + dat += dat_stride; + src += src_stride; + } while (--height != 0); + + sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32)); + } + + sse += horizontal_add_s64x2(sse_s64); + return sse; +} + +// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are +// processing 2 pixels at a time, so the accumulator max can be as high as 32768 +// for the compute stats. +#define STAT_ACCUMULATOR_MAX 32768 + +static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the +// computation. This function computes the final M from the accumulated +// (src_s64) and the residual parts (src_s32). It also transposes the result as +// the output needs to be column-major. +static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. 
For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to +// speed-up the computation. This function computes the final H from the +// accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, int stride, + int scale) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. 
+ for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-triangle indices, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } + } + } +} + +// Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win7_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. 
+ // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = { + 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, + 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, + 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, + 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, + 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + dgd += 2; + + // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 + // consecutive elements. 
+ int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8)); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4); + + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8)); + dgd_avg1[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8)); + + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5); + + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + dgd_avg1[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8)); + dgd_avg1[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8)); + + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. 
+ DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. 
The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7 + // matrix tightly packed into a int16x8_t[6] array. This array contains + // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer. 
+ // The DGD_AVG buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
+ int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); + update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); + update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[48] += DGD_AVG0[48] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 49 * 49. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); + + // The last element of the triangle of H_s32 matrix can be computed as + // scalar more efficiently. 
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor); + + update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor); +} + +// Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win5_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5 + // matrices. 
+ // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, + 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25, + 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + dgd += 2; + + // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 + // consecutive elements. 
+ int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1); + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
+ int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. 
+ if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. + int16x8_t dgd_avg0[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2)); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + + // The remaining last (25th) element of `dgd - avg`. 
+ DGD_AVG0[24] = dgd_ptr[4] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[24] += DGD_AVG0[24] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 25 * 25. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. 
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+      // The last element of the triangle of H_s32 matrix can be computed as a
+      // scalar more efficiently.
+      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+          DGD_AVG0[24] * DGD_AVG0[24];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+           downsample_factor);
+}
+
+// Returns the integer average, sum / (width * height), of the width x height
+// block of 8-bit pixels starting at src, with rows src_stride bytes apart.
+//
+// For width >= 8 the pixels are summed in 16-bit vector lanes over a batch of
+// rows and each batch is then widened into 32-bit lanes. Columns that do not
+// fill a whole vector are added to the scalar `sum`. Narrower blocks are
+// summed entirely in scalar code. height is expected to be at least 1 since
+// every row loop is a do-while.
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+                                        int width, int height) {
+  uint64_t sum = 0;
+
+  if (width >= 16) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit value, given
+    // that each 16-bit vector has 8 elements, that means we can process up to
+    // int(257*8/width) rows before we need to widen to 32-bit vector
+    // elements.
+    int h_overflow = 257 * 8 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x4_t avg_u32 = vdupq_n_u32(0);
+    do {
+      uint16x8_t avg_u16 = vdupq_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        do {
+          uint8x16_t s = vld1q_u8(src_ptr);
+          avg_u16 = vpadalq_u8(avg_u16, s);
+          j -= 16;
+          src_ptr += 16;
+        } while (j >= 16);
+        if (j >= 8) {
+          uint8x8_t s = vld1_u8(src_ptr);
+          avg_u16 = vaddw_u8(avg_u16, s);
+          j -= 8;
+          src_ptr += 8;
+        }
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+      // Move on to the next batch of h_overflow rows, clamped to the block
+      // height. Clamping the advanced limit (rather than resetting it to
+      // h_overflow) keeps every later batch full-sized instead of widening
+      // after each single row.
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+                     (width * height));
+  }
+  if (width >= 8) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit value, given
+    // that each 16-bit vector has 4 elements, that means we can process up to
+    // int(257*4/width) rows before we need to widen to 32-bit vector
+    // elements.
+    int h_overflow = 257 * 4 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x2_t avg_u32 = vdup_n_u32(0);
+    do {
+      uint16x4_t avg_u16 = vdup_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        uint8x8_t s = vld1_u8(src_ptr);
+        avg_u16 = vpadal_u8(avg_u16, s);
+        j -= 8;
+        src_ptr += 8;
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+      // Same batching rule as the 16-wide path: advance the limit by one
+      // batch and clamp it to the block height.
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+                     (width * height));
+  }
+  int i = height;
+  do {
+    int j = 0;
+    do {
+      sum += src[j];
+    } while (++j < width);
+    src += src_stride;
+  } while (--i != 0);
+  return (uint8_t)(sum / (width * height));
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+                            const uint8_t *src, int16_t *dgd_avg,
+                            int16_t *src_avg, int h_start, int h_end,
+                            int v_start, int v_end, int dgd_stride,
+                            int src_stride, int64_t *M, int64_t *H,
+                            int use_downsampled_wiener_stats) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+  (void)dgd_avg;
+  (void)src_avg;
+
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = wiener_win >> 1;
+  const int width = h_end - h_start;
+  const int height = v_end - v_start;
+
+  const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+  const uint8_t *src_start = src + h_start + v_start *
src_stride; + + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + + uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); + + // Since the height is not necessarily a multiple of the downsample factor, + // the last line of src will be scaled according to how many rows remain. + int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + int downsampled_height = height / downsample_factor; + int downsample_remainder = height % downsample_factor; + + memset(M, 0, wiener_win2 * sizeof(*M)); + memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H)); + + // Calculate the M and H matrices for the normal and downsampled cases. + if (downsampled_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } else { + compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } + } + + // Accumulate the remaining last rows in the downsampled case. 
+ if (downsample_remainder > 0) { + int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } else { + compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } + } +} + +static INLINE void calc_proj_params_r0_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t h01_lo = vdupq_n_s64(0); + int64x2_t h01_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + f1_lo = vsubw_s16(f1_lo, 
vget_low_s16(u)); + f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); + h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); + h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); + h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + H[1][0] = H[0][1]; + C[0] = 
horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + } while (--height != 0); + + H[0][0] = 
horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; +} + +static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); + f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[1][1] = 
horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// Dispatches to one of three specialised kernels, chosen from the radii in
+// params:
+// 1) params->r[0] > 0 and params->r[1] > 0: all elements of C and H are
+//    computed.
+// 2) Only params->r[0] > 0: only H[0][0] and C[0] are computed.
+// 3) Only params->r[1] > 0: only H[1][1] and C[1] are computed.
+// When neither radius is positive, H and C are left untouched.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  if (use_flt0 && use_flt1) {
+    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+    return;
+  }
+  if (use_flt0) {
+    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+    return;
+  }
+  if (use_flt1) {
+    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
diff --git a/av1/encoder/arm/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h new file mode 100644 index 0000000..f968384 --- /dev/null +++ b/av1/encoder/arm/pickrst_neon.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/restoration.h"
+
+// Aligned sizes for Wiener filters.
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
+#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
+#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
+#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
+#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
+
+// Multiply eight dgd_avg values by the matching lanes of src_avg and add the
+// widened products to the eight 32-bit M (cross correlation) sums at M_s32.
+static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
+                                   int16x8_t dgd_avg) {
+  const int16x4_t dgd_lo = vget_low_s16(dgd_avg);
+  const int16x4_t dgd_hi = vget_high_s16(dgd_avg);
+
+  const int32x4_t m_lo = vmlal_s16(vld1q_s32(M_s32 + 0), dgd_lo, src_avg);
+  const int32x4_t m_hi = vmlal_s16(vld1q_s32(M_s32 + 4), dgd_hi, src_avg);
+
+  vst1q_s32(M_s32 + 0, m_lo);
+  vst1q_s32(M_s32 + 4, m_hi);
+}
+
+// Compute 8 values of M (cross correlation) for two source pixels and
+// accumulate.
+static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, + int16x4_t src_avg1, int16x8_t dgd_avg0, + int16x8_t dgd_avg1) { + int32x4_t lo = vld1q_s32(M_s32 + 0); + int32x4_t hi = vld1q_s32(M_s32 + 4); + + lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0); + lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1); + + vst1q_s32(M_s32 + 0, lo); + vst1q_s32(M_s32 + 4, hi); +} + +static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, + int width, int height) { + for (int i = 0; i < height; i += 4) { + int16x4_t di = vld1_s16(dgd_avg + i); + + for (int j = i; j < width; j += 4) { + int16x4_t dj = vld1_s16(dgd_avg + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j); + + h0 = vmlal_lane_s16(h0, dj, di, 0); + h1 = vmlal_lane_s16(h1, dj, di, 1); + h2 = vmlal_lane_s16(h2, dj, di, 2); + h3 = vmlal_lane_s16(h3, dj, di, 3); + + vst1q_s32(H_s32 + 0 * width + j, h0); + vst1q_s32(H_s32 + 1 * width + j, h1); + vst1q_s32(H_s32 + 2 * width + j, h2); + vst1q_s32(H_s32 + 3 * width + j, h3); + } + H_s32 += 4 * width; + } +} + +static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 24; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j); + + 
h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2; + } +} + +static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 48; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i); + + h0 = vmlal_lane_s16(h0, di0, di0, 0); + h0 = vmlal_lane_s16(h0, di1, di1, 0); + h1 = vmlal_lane_s16(h1, di0, di0, 1); + h1 = vmlal_lane_s16(h1, di1, di1, 1); + h2 = vmlal_lane_s16(h2, di0, di0, 2); + h2 = vmlal_lane_s16(h2, di1, di1, 2); + h3 = vmlal_lane_s16(h3, di0, di0, 3); + h3 = vmlal_lane_s16(h3, di1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3); + + for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j); + h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j); + h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j); + h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + 
j);
+
+      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+      h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+      vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
+      vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
+      vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
+      vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
+    }
+    H_s32 += 4 * WIENER_WIN2_ALIGN2;
+  }
+}
+
+// Add the 32-bit values at src to the 64-bit values at dst, four elements per
+// step, and zero src as it goes. At least one group of four is always
+// processed, so length is effectively rounded up to a multiple of four.
+static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src,
+                                        int length) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int remaining = length;
+
+  do {
+    // Take the partial sums and reset them for the next accumulation round.
+    const int32x4_t partial = vld1q_s32(src);
+    vst1q_s32(src, zero);
+
+    vst1q_s64(dst + 0, vaddw_s32(vld1q_s64(dst + 0), vget_low_s32(partial)));
+    vst1q_s64(dst + 2, vaddw_s32(vld1q_s64(dst + 2), vget_high_s32(partial)));
+
+    src += 4;
+    dst += 4;
+    remaining -= 4;
+  } while (remaining > 0);
+}
+
+#endif  // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
diff --git a/av1/encoder/arm/pickrst_sve.c b/av1/encoder/arm/pickrst_sve.c new file mode 100644 index 0000000..5d7370b --- /dev/null +++ b/av1/encoder/arm/pickrst_sve.c
@@ -0,0 +1,465 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <assert.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/arm/pickrst_sve.h" + +static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, + int width, int height) { + uint32x4_t avg_u32 = vdupq_n_u32(0); + uint8x16_t ones = vdupq_n_u8(1); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b8_u32(0, width % 16); + + int h = height; + do { + int j = width; + const uint8_t *src_ptr = src; + while (j >= 16) { + uint8x16_t s = vld1q_u8(src_ptr); + avg_u32 = vdotq_u32(avg_u32, s, ones); + + j -= 16; + src_ptr += 16; + } + uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr)); + avg_u32 = vdotq_u32(avg_u32, s_end, ones); + + src += src_stride; + } while (--h != 0); + return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); +} + +static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, + int16_t *buf_avg, int buf_avg_stride, + int width, int height, + int downsample_factor) { + uint8x8_t avg_u8 = vdup_n_u8(avg); + + // Use a predicate to compute the last columns. 
+ svbool_t pattern = svwhilelt_b8_u32(0, width % 8); + + uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg))); + + do { + int j = width; + const uint8_t *buf_ptr = buf; + int16_t *buf_avg_ptr = buf_avg; + while (j >= 8) { + uint8x8_t d = vld1_u8(buf_ptr); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); + + j -= 8; + buf_ptr += 8; + buf_avg_ptr += 8; + } + uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr))); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end))); + + buf += buf_stride; + buf_avg += buf_avg_stride; + height -= downsample_factor; + } while (height > 0); +} + +static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, + const int wiener_win2, const int scale) { + for (int i = 0; i < wiener_win2 - 2; i = i + 2) { + // Transpose the first 2x2 square. It needs a special case as the element + // of the bottom left is on the diagonal. + int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); + int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); + + int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); + + vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); + vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); + + // Transpose and store all the remaining 2x2 squares of the line. + for (int j = i + 3; j < wiener_win2; j = j + 2) { + row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); + row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); + + int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); + int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); + + vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0); + vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; i++) { + H[i] += H_tmp[i] * scale; + } +} + +// Transpose the matrix that has just been computed and accumulate it in M. 
+// That is, M[i * wiener_win + j] += M_trn[j * wiener_win + i] * scale for
+// every i, j in [0, wiener_win).
+static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
+                                   const int wiener_win, int scale) {
+  for (int row = 0; row < wiener_win; ++row) {
+    int64_t *const M_row = M + row * wiener_win;
+    for (int col = 0; col < wiener_win; ++col) {
+      M_row[col] += M_trn[col * wiener_win + row] * scale;
+    }
+  }
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride,
+                                          int16_t *src_avg, int src_avg_stride,
+                                          int width, int height, int64_t *M,
+                                          int64_t *H, int downsample_factor) {
+  const int wiener_win = 7;
+  const int wiener_win2 = wiener_win * wiener_win;
+
+  // Use a predicate to compute the last columns of the block for H.
+  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
+
+  // Use intermediate matrices for H and M to perform the computation, they
+  // will be accumulated into the original H and M at the end.
+ int64_t M_trn[49]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[49 * 49]; + memset(H_tmp, 0, sizeof(H_tmp)); + + assert(height > 0); + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[7]; + load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win7(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[7]; + load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + if (j < width) { + // Process remaining columns using a predicate to discard excess elements. + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. 
+ int16x8_t dgd0[7]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + dgd0[5] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0)); + dgd0[6] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 7, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). 
+// +// M is of size 5 * 5. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 25 * 25. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 5; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[25]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[25 * 25]; + memset(H_tmp, 0, sizeof(H_tmp)); + + assert(height > 0); + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[5]; + load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. 
+ compute_M_one_row_win5(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[5]; + load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4]); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + // Process remaining columns using a predicate to discard excess elements. + if (j < width) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[5]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (15 elements). 
+ // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 5, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = wiener_win >> 1; + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + memset(M, 0, sizeof(*M) * wiener_win * wiener_win); + + const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); + const int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + // dgd_avg and src_avg have been memset to zero before calling this + // function, so round up the stride to the next multiple of 8 so that we + // don't have to worry about a tail loop when computing M. + const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; + const int src_avg_stride = (width & ~7) + 8; + + // Compute (dgd - avg) and store it in dgd_avg. + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, + width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); + + // Compute (src - avg), downsample if necessary and store it in src_avg. + const uint8_t *src_start = src + h_start + v_start * src_stride; + compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, + src_avg_stride, width, height, downsample_factor); + + const int downsample_height = height / downsample_factor; + + // Since the height is not necessarily a multiple of the downsample factor, + // the last line of src will be scaled according to how many rows remain. 
+ const int downsample_remainder = height % downsample_factor; + + if (downsample_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } else { + compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } + } + + if (downsample_remainder > 0) { + const int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } else { + compute_stats_win5_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } + } +}
diff --git a/av1/encoder/arm/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h new file mode 100644 index 0000000..97f08fc --- /dev/null +++ b/av1/encoder/arm/pickrst_sve.h
@@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ +#define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" + +// Swap each half of the dgd vectors so that we can accumulate the result of +// the dot-products directly in the destination matrix. +static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { + int16x8_t dgd_trn0 = vreinterpretq_s16_s64( + vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + int16x8_t dgd_trn1 = vreinterpretq_s16_s64( + vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + + return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; +} + +static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], + int64_t *M, int row) { + const int wiener_win = 5; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, 
cross_corr23); + + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); + M[row * wiener_win + 4] += vaddvq_s64(m4); +} + +static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], + int64_t *M, int row) { + const int wiener_win = 7; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); + + int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); + cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 4, cross_corr45); + + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); + M[row * wiener_win + 6] += vaddvq_s64(m6); +} + +static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, + const int wiener_win, + const int wiener_win2) { + for (int row0 = 0; row0 < wiener_win; row0++) { + for (int row1 = row0; row1 < wiener_win; row1++) { + int auto_cov_idx = + (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; + + int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); + H[auto_cov_idx] += vaddvq_s64(auto_cov); + } + } +} + +static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 5; col0++) { + int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); + + int64x2_t h01 = 
vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); + H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); + } +} + +static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 7; col0++) { + int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); + + int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); + auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 4, auto_cov45); + + int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); + H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); + 
} +} + +#endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/quantize_neon.c similarity index 100% rename from av1/encoder/arm/neon/quantize_neon.c rename to av1/encoder/arm/quantize_neon.c
diff --git a/av1/encoder/arm/neon/rdopt_neon.c b/av1/encoder/arm/rdopt_neon.c similarity index 100% rename from av1/encoder/arm/neon/rdopt_neon.c rename to av1/encoder/arm/rdopt_neon.c
diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/reconinter_enc_neon.c similarity index 98% rename from av1/encoder/arm/neon/reconinter_enc_neon.c rename to av1/encoder/arm/reconinter_enc_neon.c index 03afa30..3d17723 100644 --- a/av1/encoder/arm/neon/reconinter_enc_neon.c +++ b/av1/encoder/arm/reconinter_enc_neon.c
@@ -222,8 +222,7 @@ int i = height / 2; do { uint16x4_t r = load_u16_2x2(ref, ref_stride); - store_u16_2x1(comp_pred + 0 * width, r, 0); - store_u16_2x1(comp_pred + 1 * width, r, 1); + store_u16x2_strided_x2(comp_pred, width, r); ref += 2 * ref_stride; comp_pred += 2 * width; } while (--i != 0);
diff --git a/av1/encoder/arm/neon/shift_neon.h b/av1/encoder/arm/shift_neon.h similarity index 93% rename from av1/encoder/arm/neon/shift_neon.h rename to av1/encoder/arm/shift_neon.h index d73aef2..ad9fd9c 100644 --- a/av1/encoder/arm/neon/shift_neon.h +++ b/av1/encoder/arm/shift_neon.h
@@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ +#ifndef AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ +#define AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ #include <arm_neon.h> @@ -46,4 +46,4 @@ #undef SHIFT_LOOP_HELPER -#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ +#endif // AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_
diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c similarity index 100% rename from av1/encoder/arm/neon/temporal_filter_neon.c rename to av1/encoder/arm/temporal_filter_neon.c
diff --git a/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c similarity index 87% rename from av1/encoder/arm/neon/temporal_filter_neon_dotprod.c rename to av1/encoder/arm/temporal_filter_neon_dotprod.c index 5a52e70..919521f 100644 --- a/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c +++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -23,7 +23,15 @@ #define SSE_STRIDE (BW + 4) // clang-format off +// Table used to pad the first and last columns and apply the sliding window. +DECLARE_ALIGNED(16, static const uint8_t, kLoadPad[4][16]) = { + { 2, 2, 2, 3, 4, 255, 255, 255, 255, 2, 2, 3, 4, 5, 255, 255 }, + { 255, 255, 2, 3, 4, 5, 6, 255, 255, 255, 255, 3, 4, 5, 6, 7 }, + { 0, 1, 2, 3, 4, 255, 255, 255, 255, 1, 2, 3, 4, 5, 255, 255 }, + { 255, 255, 2, 3, 4, 5, 5, 255, 255, 255, 255, 3, 4, 5, 5, 5 } +}; +// For columns that don't need to be padded it's just a simple mask. DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, @@ -56,22 +64,6 @@ } while (++i < block_height); } -static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col, - const uint32_t block_width) { - uint8x8_t s = vld1_u8(src); - - if (col == 0) { - const uint8_t lane2 = vget_lane_u8(s, 2); - s = vset_lane_u8(lane2, s, 0); - s = vset_lane_u8(lane2, s, 1); - } else if (col >= block_width - 4) { - const uint8_t lane5 = vget_lane_u8(s, 5); - s = vset_lane_u8(lane5, s, 6); - s = vset_lane_u8(lane5, s, 7); - } - return vcombine_u8(s, s); -} - static void apply_temporal_filter( const uint8_t *frame, const unsigned int stride, const uint32_t block_width, const uint32_t block_height, const int *subblock_mses, @@ -84,6 +76,10 @@ uint32_t acc_5x5_neon[BH][BW]; const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask); + const uint8x16_t pad_tbl0 = vld1q_u8(kLoadPad[0]); + const uint8x16_t pad_tbl1 = vld1q_u8(kLoadPad[1]); + const uint8x16_t pad_tbl2 = vld1q_u8(kLoadPad[2]); + const uint8x16_t pad_tbl3 = vld1q_u8(kLoadPad[3]); // Traverse 4 columns at a time - first and last two columns need padding. for (uint32_t col = 0; col < block_width; col += 4) { @@ -92,9 +88,18 @@ // Load, pad (for first and last two columns) and mask 3 rows from the top. 
for (int i = 2; i < 5; i++) { - const uint8x16_t s = load_and_pad(src, col, block_width); - vsrc[i][0] = vandq_u8(s, vmask.val[0]); - vsrc[i][1] = vandq_u8(s, vmask.val[1]); + uint8x8_t s = vld1_u8(src); + uint8x16_t s_dup = vcombine_u8(s, s); + if (col == 0) { + vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl0); + vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl1); + } else if (col >= block_width - 4) { + vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl2); + vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl3); + } else { + vsrc[i][0] = vandq_u8(s_dup, vmask.val[0]); + vsrc[i][1] = vandq_u8(s_dup, vmask.val[1]); + } src += SSE_STRIDE; } @@ -130,9 +135,18 @@ if (row <= block_height - 4) { // Load next row into the bottom of the sliding window. - uint8x16_t s = load_and_pad(src, col, block_width); - vsrc[4][0] = vandq_u8(s, vmask.val[0]); - vsrc[4][1] = vandq_u8(s, vmask.val[1]); + uint8x8_t s = vld1_u8(src); + uint8x16_t s_dup = vcombine_u8(s, s); + if (col == 0) { + vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl0); + vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl1); + } else if (col >= block_width - 4) { + vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl2); + vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl3); + } else { + vsrc[4][0] = vandq_u8(s_dup, vmask.val[0]); + vsrc[4][1] = vandq_u8(s_dup, vmask.val[1]); + } src += SSE_STRIDE; } else { // Pad the bottom 2 rows.
diff --git a/av1/encoder/arm/neon/txfm_neon.h b/av1/encoder/arm/txfm_neon.h similarity index 86% rename from av1/encoder/arm/neon/txfm_neon.h rename to av1/encoder/arm/txfm_neon.h index 635364f..8b07dfb 100644 --- a/av1/encoder/arm/neon/txfm_neon.h +++ b/av1/encoder/arm/txfm_neon.h
@@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ -#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#ifndef AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ +#define AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ #include "aom/aom_integer.h" // For AOM_INLINE. @@ -23,4 +23,4 @@ } } -#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#endif // AOM_AV1_ENCODER_ARM_TXFM_NEON_H_
diff --git a/av1/encoder/arm/neon/wedge_utils_neon.c b/av1/encoder/arm/wedge_utils_neon.c similarity index 100% rename from av1/encoder/arm/neon/wedge_utils_neon.c rename to av1/encoder/arm/wedge_utils_neon.c
diff --git a/av1/encoder/arm/wedge_utils_sve.c b/av1/encoder/arm/wedge_utils_sve.c new file mode 100644 index 0000000..521601a --- /dev/null +++ b/av1/encoder/arm/wedge_utils_sve.c
@@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/reconinter.h" + +uint64_t av1_wedge_sse_from_residuals_sve(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + assert(N % 64 == 0); + + // Predicate pattern with first 8 elements true. + const svbool_t pattern = svptrue_pat_b16(SV_VL8); + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + int i = 0; + do { + int32x4_t sum[4]; + int16x8_t sum_s16[2]; + + const int16x8_t r1_l = vld1q_s16(r1 + i); + const int16x8_t r1_h = vld1q_s16(r1 + i + 8); + const int16x8_t d_l = vld1q_s16(d + i); + const int16x8_t d_h = vld1q_s16(d + i + 8); + + // Use a zero-extending load to widen the vector elements. 
+ const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m + i)); + const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + i + 8)); + + sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS); + sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS); + + sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l)); + sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l)); + sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h)); + sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h)); + + sum_s16[0] = vcombine_s16(vqmovn_s32(sum[0]), vqmovn_s32(sum[1])); + sum_s16[1] = vcombine_s16(vqmovn_s32(sum[2]), vqmovn_s32(sum[3])); + + sse[0] = aom_sdotq_s16(sse[0], sum_s16[0], sum_s16[0]); + sse[1] = aom_sdotq_s16(sse[1], sum_s16[1], sum_s16[1]); + + i += 16; + } while (i < N); + + const uint64_t csse = + (uint64_t)horizontal_add_s64x2(vaddq_s64(sse[0], sse[1])); + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +int8_t av1_wedge_sign_from_residuals_sve(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + assert(N % 16 == 0); + + // Predicate pattern with first 8 elements true. + svbool_t pattern = svptrue_pat_b16(SV_VL8); + int64x2_t acc_l = vdupq_n_s64(0); + int64x2_t acc_h = vdupq_n_s64(0); + + do { + const int16x8_t ds_l = vld1q_s16(ds); + const int16x8_t ds_h = vld1q_s16(ds + 8); + + // Use a zero-extending load to widen the vector elements. + const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m)); + const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + 8)); + + acc_l = aom_sdotq_s16(acc_l, ds_l, m_l); + acc_h = aom_sdotq_s16(acc_h, ds_h, m_h); + + ds += 16; + m += 16; + N -= 16; + } while (N != 0); + + const int64x2_t sum = vaddq_s64(acc_l, acc_h); + return horizontal_add_s64x2(sum) > limit; +}
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index 1aad473..110d17f 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c
@@ -15,6 +15,7 @@ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" #include "aom_ports/mem.h" #include "av1/common/idct.h" @@ -581,7 +582,7 @@ uint32_t t; int l, m; t = d; - for (l = 0; t > 1; l++) t >>= 1; + l = get_msb(t); m = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(m - (1 << 16)); *shift = 1 << (16 - l);
diff --git a/av1/encoder/av1_temporal_denoiser.c b/av1/encoder/av1_temporal_denoiser.c index 3012df6..d4a1625 100644 --- a/av1/encoder/av1_temporal_denoiser.c +++ b/av1/encoder/av1_temporal_denoiser.c
@@ -489,7 +489,7 @@ &denoiser->running_avg_y[fb_idx], cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -577,7 +577,7 @@ fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, - legacy_byte_alignment, 0, 0); + legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -589,7 +589,7 @@ fail = aom_alloc_frame_buffer( &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx, - ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0); + ssy, use_highbitdepth, border, legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -600,7 +600,7 @@ // layer. fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, use_highbitdepth, border, legacy_byte_alignment, - 0, 0); + false, 0); if (fail) { av1_denoiser_free(denoiser); return 1;
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index a9e7978..163b62c 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c
@@ -26,6 +26,7 @@ #include "av1/common/cdef.h" #include "av1/common/cfl.h" +#include "av1/common/debugmodes.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" @@ -3391,8 +3392,8 @@ return AOM_CODEC_OK; } -size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, - uint8_t *data) { +static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = length_field_size + obu_header_size; const size_t move_src_offset = obu_header_size; @@ -3521,9 +3522,6 @@ return size; } -extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, - const char *filename); - typedef struct { uint32_t tg_hdr_size; uint32_t frame_header_size; @@ -3581,7 +3579,7 @@ *total_size += lst_obu->tg_hdr_size; const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; const size_t length_field_size = - av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != AOM_CODEC_OK) assert(0); @@ -3642,7 +3640,9 @@ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes"); + } tile_size = mode_bc.pos; buf->size = tile_size; @@ -3778,7 +3778,10 @@ // Pack tile data aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR, + "Error writing modes"); + } tile_size = mode_bc.pos; 
assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); @@ -3801,7 +3804,7 @@ const uint32_t obu_payload_size = (uint32_t)(*curr_tg_data_size) - obu_header_size; const size_t length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, curr_tg_start) != AOM_CODEC_OK) { assert(0); @@ -4010,8 +4013,8 @@ // to pack the smaller bitstream of such frames. This function computes the // number of required number of workers based on setup time overhead and job // dispatch time overhead for given tiles and available workers. -int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, - int avail_workers, bool pack_bs_mt_enabled) { +static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, + int avail_workers, bool pack_bs_mt_enabled) { if (!pack_bs_mt_enabled) return 1; uint64_t frame_abs_sum_level = 0; @@ -4136,8 +4139,7 @@ OBU_METADATA, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); - length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, dst); + length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == AOM_CODEC_OK) { const size_t obu_size = obu_header_size + obu_payload_size; @@ -4187,7 +4189,7 @@ obu_payload_size = av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); const size_t length_field_size = - av1_obu_memmove(obu_header_size, obu_payload_size, data); + obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; @@ -4212,7 +4214,7 @@ obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, data + obu_header_size, 1); - length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data); + 
length_field = obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR;
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h index 12e8a63..d037039 100644 --- a/av1/encoder/bitstream.h +++ b/av1/encoder/bitstream.h
@@ -21,6 +21,7 @@ #include "av1/common/enums.h" #include "av1/encoder/level.h" #include "aom_dsp/bitwriter.h" +#include "aom_util/aom_pthread.h" struct aom_write_bit_buffer; struct AV1_COMP;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h index 33d2d8c..9bee0b8 100644 --- a/av1/encoder/block.h +++ b/av1/encoder/block.h
@@ -1325,6 +1325,9 @@ //! Coding block distortion value for uv/color, minimum over the inter modes. int64_t min_dist_inter_uv; + //! Threshold on the number of colors for testing palette mode. + int color_palette_thresh; + //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane // so we can keep dqcoeff of the best tx_type. tran_low_t *dqcoeff_buf; @@ -1348,6 +1351,9 @@ //! Motion vector from superblock MV derived from int_pro_motion() in // the variance_partitioning. int_mv sb_me_mv; + //! Flag to indicate if a fixed partition should be used, only if the + // speed feature rt_sf->use_fast_fixed_part is enabled. + int sb_force_fixed_part; //! SSE of the current predictor. unsigned int pred_sse[REF_FRAMES]; //! Prediction for ML based partition.
diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c index 28e1f71..b019ace 100644 --- a/av1/encoder/cnn.c +++ b/av1/encoder/cnn.c
@@ -31,13 +31,9 @@ int th_step; } CONVOLVE_OPS; -typedef float (*activation_fn)(float); +static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); } -static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); } - -static float relu(float x) { return (x < 0) ? 0 : x; } - -static float identity(float x) { return x; } +static INLINE float relu(float x) { return (x < 0) ? 0 : x; } typedef struct { int allocsize; @@ -142,14 +138,16 @@ return true; } -int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { +#ifndef NDEBUG +static int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { return (t1->width == t2->width && t1->height == t2->height); } -int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { +static int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { return (t1->channels == t2->channels && t1->width == t2->width && t1->height == t2->height); } +#endif // NDEBUG void av1_find_cnn_layer_output_size(int in_width, int in_height, const CNN_LAYER_CONFIG *layer_config, @@ -193,8 +191,8 @@ } } -void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, - int channels_per_branch[]) { +static void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { int branch = layer_config->branch; const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { @@ -291,18 +289,6 @@ } } -activation_fn get_activation(ACTIVATION layer_activation) { - switch (layer_activation) { - case NONE: return identity; - case RELU: return relu; - case SOFTSIGN: return softsign; - case SIGMOID: - assert(0 && "Sigmoid has not been supported in CNN."); // TO DO - return NULL; - default: assert(0 && "Unknown activation type"); return NULL; - } -} - static INLINE int get_start_shift_convolve(int width, int filt_width, int stride) { const int mod = (width % stride); @@ -322,11 +308,22 @@ void av1_cnn_activate_c(float **output, int channels, int width, int height, int stride, ACTIVATION 
layer_activation) { - activation_fn activation = get_activation(layer_activation); - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < height; ++i) - for (int j = 0; j < width; ++j) - output[c][i * stride + j] = activation(output[c][i * stride + j]); + if (layer_activation == RELU) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = relu(output[c][i * stride + j]); + } + } else if (layer_activation == SOFTSIGN) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = softsign(output[c][i * stride + j]); + } + } else if (layer_activation == SIGMOID) { + assert(0 && "Sigmoid has not been supported in CNN."); // TO DO + } else if (layer_activation != NONE) { + assert(0 && "Unknown activation type"); } } @@ -1013,10 +1010,9 @@ } // Non-linearity - if (layer_config->activation != IDENTITY) - av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, - tensor2[branch].width, tensor2[branch].height, - tensor2[branch].stride, layer_config->activation); + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); if (layer_config->bn_params.bn_gamma) { av1_cnn_batchnorm(
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c index 7b8240d..aafe55d 100644 --- a/av1/encoder/context_tree.c +++ b/av1/encoder/context_tree.c
@@ -248,11 +248,11 @@ if (!keep_best && !keep_none) aom_free(pc_tree); } -void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { +int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { // The structure 'sms_tree' is used to store the simple motion search data for // partition pruning in inter frames. Hence, the memory allocations and // initializations related to it are avoided for allintra encoding mode. - if (cpi->oxcf.kf_cfg.key_freq_max == 0) return; + if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0; AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); @@ -265,8 +265,9 @@ int nodes; aom_free(td->sms_tree); - CHECK_MEM_ERROR(cm, td->sms_tree, - aom_calloc(tree_nodes, sizeof(*td->sms_tree))); + td->sms_tree = + (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree)); + if (!td->sms_tree) return -1; this_sms = &td->sms_tree[0]; if (!stat_generation_stage) { @@ -301,6 +302,7 @@ // Set up the root node for the largest superblock size td->sms_root = &td->sms_tree[tree_nodes - 1]; + return 0; } void av1_free_sms_tree(ThreadData *td) {
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h index 78f2076..0be7ccb 100644 --- a/av1/encoder/context_tree.h +++ b/av1/encoder/context_tree.h
@@ -131,7 +131,8 @@ return tree_nodes; } -void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); +// Returns 0 on success, -1 on memory allocation failure. +int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); void av1_free_sms_tree(struct ThreadData *td); #ifdef __cplusplus
diff --git a/av1/encoder/enc_enums.h b/av1/encoder/enc_enums.h index 20cefa1..0a8b0f2 100644 --- a/av1/encoder/enc_enums.h +++ b/av1/encoder/enc_enums.h
@@ -12,10 +12,14 @@ #ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ #define AOM_AV1_ENCODER_ENC_ENUMS_H_ +#include "aom_ports/mem.h" + #ifdef __cplusplus extern "C" { #endif +#define MAX_NUM_THREADS 64 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. enum {
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c index 878cec5..db77dc0 100644 --- a/av1/encoder/encode_strategy.c +++ b/av1/encoder/encode_strategy.c
@@ -237,10 +237,24 @@ // Clear down mmx registers - if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) { - cpi->framerate = cpi->svc.base_framerate; - av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); - return; + if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config && + cpi->svc.number_spatial_layers > 1) { + // ts_start is the timestamp for the current frame and ts_end is the + // expected next timestamp given the duration passed into codec_encode(). + // See the setting in encoder_encode() in av1_cx_iface.c: + // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol), + // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + + // duration). So the difference ts_end - ts_start is the duration passed + // in by the user. For spatial layers SVC set the framerate based directly + // on the duration, and bypass the adjustments below. + this_duration = ts_end - ts_start; + if (this_duration > 0) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; + return; + } } if (ts_start == time_stamps->first_ts_start) { @@ -698,20 +712,6 @@ } #if !CONFIG_REALTIME_ONLY -void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { - AV1_COMMON *const cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; - - av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); - - av1_setup_block_planes(xd, cm->seq_params->subsampling_x, - cm->seq_params->subsampling_y, num_planes); - - set_mi_offsets(&cm->mi_params, xd, 0, 0); -} - // Apply temporal filtering to source frames and encode the filtered frame. // If the current frame does not require filtering, this function is identical // to av1_encode() except that tpl is not performed. 
@@ -805,7 +805,7 @@ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0); + NULL, cpi->alloc_pyramid, 0); if (ret) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate tf_buf_second_arf"); @@ -909,7 +909,7 @@ if (apply_filtering && is_psnr_calc_enabled(cpi)) { cpi->source = av1_realloc_and_scale_if_required( cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0, - false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, true, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); cpi->unscaled_source = source_buffer; } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -1688,8 +1688,7 @@ // This is used in rtc temporal filter case. Use true source in the PSNR // calculation. - if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && - cpi->common.current_frame.frame_type != KEY_FRAME) { + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { assert(cpi->orig_source.buffer_alloc_sz > 0); cpi->source = &cpi->orig_source; } @@ -1744,9 +1743,9 @@ cpi->svc.temporal_layer_id == 0 && cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width && cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) { - aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0); - aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0); - aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); + aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); + aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); } return AOM_CODEC_OK;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 2c6e49f..4c178b1 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c
@@ -23,7 +23,7 @@ #include "aom_dsp/binary_codes_writer.h" #include "aom_ports/mem.h" #include "aom_ports/aom_timer.h" - +#include "aom_util/aom_pthread.h" #if CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_MISMATCH_DEBUG @@ -535,11 +535,19 @@ } #endif // Set the partition - if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && + (!frame_is_intra_only(cm) && + (!cpi->ppi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); - const BLOCK_SIZE bsize = - seg_skip ? sb_size : sf->part_sf.fixed_partition_size; + BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; + if (sf->rt_sf.use_fast_fixed_part && + x->content_state_sb.source_sad_nonrd < kLowSad) { + bsize_select = BLOCK_64X64; + } + const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select; av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { // set a variance-based partition @@ -1048,8 +1056,13 @@ // The threshold is determined based on kLowSad and kHighSad threshold and // test results. 
- const uint64_t thresh_low = 15000; - const uint64_t thresh_high = 40000; + uint64_t thresh_low = 15000; + uint64_t thresh_high = 40000; + + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + thresh_low = thresh_low << 1; + thresh_high = thresh_high << 1; + } if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) { do_calc_src_content = false; @@ -1197,6 +1210,8 @@ x->sb_me_block = 0; x->sb_me_partition = 0; x->sb_me_mv.as_int = 0; + x->sb_force_fixed_part = 1; + x->color_palette_thresh = 64; if (cpi->oxcf.mode == ALLINTRA) { x->intra_sb_rdmult_modifier = 128; @@ -1225,7 +1240,7 @@ // Grade the temporal variation of the sb, the grade will be used to decide // fast mode search strategy for coding blocks - grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); + if (!seg_skip) grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); // encode the superblock if (use_nonrd_mode) { @@ -1267,17 +1282,32 @@ void av1_alloc_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; av1_row_mt_mem_dealloc(cpi); aom_free(cpi->tile_data); + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; + CHECK_MEM_ERROR( cm, cpi->tile_data, aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; + enc_row_mt->allocated_tile_cols = tile_cols; + enc_row_mt->allocated_tile_rows = tile_rows; + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + av1_zero(this_tile->row_mt_sync); + this_tile->row_ctx = NULL; + } + } } void av1_init_tile_data(AV1_COMP *cpi) { @@ -1568,20 +1598,12 @@ // High Latency: Turn off skip mode if all 
refs are fwd. if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0; - static const int flag_list[REF_FRAMES] = { 0, - AOM_LAST_FLAG, - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, - AOM_GOLD_FLAG, - AOM_BWD_FLAG, - AOM_ALT2_FLAG, - AOM_ALT_FLAG }; const int ref_frame[2] = { cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME }; - if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || - !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) return 0; return 1; @@ -2324,7 +2346,7 @@ // a source or a ref frame should have an image pyramid allocated. // Check here so that issues can be caught early in debug mode #if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY - if (cpi->image_pyramid_levels > 0) { + if (cpi->alloc_pyramid) { assert(cpi->source->y_pyramid); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c index 94298c8..a8e4a88 100644 --- a/av1/encoder/encodeframe_utils.c +++ b/av1/encoder/encodeframe_utils.c
@@ -15,6 +15,7 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/rdopt.h" void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, @@ -306,6 +307,7 @@ // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + mi_addr->segment_id != AM_SEGMENT_ID_INACTIVE && !cpi->rc.rtc_external_ratectrl) { av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, ctx->rd_stats.rate, ctx->rd_stats.dist, @@ -1398,6 +1400,11 @@ 36000 }; // ~3*3*(64*64) uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1; + avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1; + avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1; + } uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 src_y += src_offset; last_src_y += last_src_offset; @@ -1426,6 +1433,10 @@ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1)) x->content_state_sb.low_sumdiff = 1; + if (tmp_sse > ((avg_source_sse_threshold_high * 7) >> 3) && + !x->content_state_sb.lighting_change && !x->content_state_sb.low_sumdiff) + x->sb_force_fixed_part = 0; + if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad || cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1) return; @@ -1753,6 +1764,11 @@ void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) { const int num_planes = av1_num_planes(cm); +#ifndef NDEBUG + for (int plane = 0; plane < num_planes; ++plane) { + assert(!mb->plane[plane].src_diff); + } +#endif for (int plane = 0; plane < num_planes; ++plane) { const int subsampling_xy = plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 07b6ffe..093eabc 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c
@@ -35,11 +35,13 @@ #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_scale/aom_scale.h" +#include "aom_util/aom_pthread.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG #include "av1/common/alloccommon.h" +#include "av1/common/debugmodes.h" #include "av1/common/filter.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" @@ -152,24 +154,34 @@ unsigned char *const active_map_4x4 = cpi->active_map.map; const int mi_rows = mi_params->mi_rows; const int mi_cols = mi_params->mi_cols; - const int row_scale = mi_size_high_log2[BLOCK_16X16]; - const int col_scale = mi_size_wide_log2[BLOCK_16X16]; cpi->active_map.update = 0; - assert(mi_rows % 2 == 0); - assert(mi_cols % 2 == 0); + cpi->rc.percent_blocks_inactive = 0; + assert(mi_rows % 2 == 0 && mi_rows > 0); + assert(mi_cols % 2 == 0 && mi_cols > 0); if (new_map_16x16) { - for (int r = 0; r < (mi_rows >> row_scale); ++r) { - for (int c = 0; c < (mi_cols >> col_scale); ++c) { - const uint8_t val = new_map_16x16[r * cols + c] + int num_samples = 0; + int num_blocks_inactive = 0; + for (int r = 0; r < mi_rows; r += 4) { + for (int c = 0; c < mi_cols; c += 4) { + const uint8_t val = new_map_16x16[(r >> 2) * cols + (c >> 2)] ? 
AM_SEGMENT_ID_ACTIVE : AM_SEGMENT_ID_INACTIVE; - active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val; - active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val; - active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val; - active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val; + num_samples++; + if (val == AM_SEGMENT_ID_INACTIVE) num_blocks_inactive++; + const int row_max = AOMMIN(4, mi_rows - r); + const int col_max = AOMMIN(4, mi_cols - c); + for (int x = 0; x < row_max; ++x) { + for (int y = 0; y < col_max; ++y) { + active_map_4x4[(r + x) * mi_cols + (c + y)] = val; + } + } } } cpi->active_map.enabled = 1; + cpi->active_map.update = 1; + assert(num_samples); + cpi->rc.percent_blocks_inactive = + (num_blocks_inactive * 100) / num_samples; } return 0; } @@ -642,14 +654,12 @@ cm->height = oxcf->frm_dim_cfg.height; cpi->is_dropped_frame = false; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; - - cpi->frame_size_related_setup_done = false; - alloc_compressor_data(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + // Single thread case: use counts in common. 
cpi->td.counts = &cpi->counts; @@ -773,7 +783,6 @@ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; MACROBLOCK *const x = &cpi->td.mb; AV1LevelParams *const level_params = &cpi->ppi->level_params; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -913,8 +922,8 @@ cm->width = frm_dim_cfg->width; cm->height = frm_dim_cfg->height; - if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height || is_sb_size_changed) { + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height || is_sb_size_changed) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -922,8 +931,8 @@ cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } av1_update_frame_size(cpi); @@ -946,14 +955,9 @@ #if CONFIG_REALTIME_ONLY assert(!oxcf->tool_cfg.enable_global_motion); - cpi->image_pyramid_levels = 0; + cpi->alloc_pyramid = false; #else - if (oxcf->tool_cfg.enable_global_motion) { - cpi->image_pyramid_levels = - global_motion_pyr_levels[default_global_motion_method]; - } else { - cpi->image_pyramid_levels = 0; - } + cpi->alloc_pyramid = oxcf->tool_cfg.enable_global_motion; #endif // CONFIG_REALTIME_ONLY } @@ -1501,6 +1505,7 @@ cpi->mb_weber_stats = NULL; cpi->mb_delta_q = NULL; cpi->palette_pixel_num = 0; + cpi->scaled_last_source_available = 0; { const BLOCK_SIZE bsize = BLOCK_16X16; @@ -2072,8 +2077,8 @@ // TODO(chengchen): consider renaming this function as it is necessary // for the encoder to setup critical parameters, and it does not 
// deal with initial width any longer. -void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, - int subsampling_x, int subsampling_y) { +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; @@ -2090,7 +2095,8 @@ if (!is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY - av1_tf_info_alloc(&cpi->ppi->tf_info, cpi); + if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) + return AOM_CODEC_MEM_ERROR; #endif // !CONFIG_REALTIME_ONLY } init_ref_frame_bufs(cpi); @@ -2100,6 +2106,7 @@ cpi->initial_mbs = cm->mi_params.MBs; cpi->frame_size_related_setup_done = true; } + return AOM_CODEC_OK; } #if CONFIG_AV1_TEMPORAL_DENOISING @@ -2119,12 +2126,14 @@ #endif // Returns 1 if the assigned width or height was <= 0. -int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { +static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth, - cm->seq_params->subsampling_x, - cm->seq_params->subsampling_y); + aom_codec_err_t err = av1_check_initial_width( + cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(cm->error, err, "av1_check_initial_width() failed"); + } if (width <= 0 || height <= 0) return 1; @@ -2135,8 +2144,8 @@ setup_denoiser_buffer(cpi); #endif - if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height) { + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -2144,8 +2153,8 @@ cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); 
realloc_segmentation_maps(cpi); - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } alloc_mb_mode_info_buffers(cpi); @@ -2163,7 +2172,7 @@ if (width != cm->width || height != cm->height) { // There has been a change in the encoded frame size - av1_set_size_literal(cpi, width, height); + set_size_literal(cpi, width, height); // Recalculate 'all_lossless' in case super-resolution was (un)selected. cm->features.all_lossless = cm->features.coded_lossless && !av1_superres_scaled(cm); @@ -2206,7 +2215,7 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0)) + NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -2387,7 +2396,10 @@ const int use_loopfilter = is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; - const int use_cdef = is_cdef_used(cm); + const int use_cdef = + is_cdef_used(cm) && (!cpi->active_map.enabled || + cpi->rc.percent_blocks_inactive <= + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef); const int use_superres = av1_superres_scaled(cm); const int use_restoration = is_restoration_used(cm); @@ -2473,7 +2485,6 @@ const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg; SVC *const svc = &cpi->svc; const int resize_pending = is_frame_resize_pending(cpi); - int top_index = 0, bottom_index = 0, q = 0; YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; InterpFilter filter_scaler = @@ -2497,7 +2508,8 @@ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) { + 
cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, + 0)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffer for source_last_TL0"); } @@ -2546,7 +2558,7 @@ cpi->source = av1_realloc_and_scale_if_required( cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true, - false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (frame_is_intra_only(cm) || resize_pending != 0) { const int current_size = (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2; @@ -2562,11 +2574,14 @@ memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv)); } - if (cpi->unscaled_last_source != NULL) { + if (cpi->scaled_last_source_available) { + cpi->last_source = &cpi->scaled_last_source; + cpi->scaled_last_source_available = 0; + } else if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler, phase_scaler, true, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } if (cpi->sf.rt_sf.use_temporal_noise_estimate) { @@ -2584,13 +2599,16 @@ // av1_scale_references. Note GOLDEN is forced to update on the (first/tigger) // resized frame and ALTREF will be refreshed ~4 frames later, so both // references become available again after few frames. + // For superres: don't disable golden reference. 
if (svc->number_spatial_layers == 1) { - if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { - const YV12_BUFFER_CONFIG *const ref = - get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); - if (ref == NULL || ref->y_crop_width != cm->width || - ref->y_crop_height != cm->height) { - cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + if (!cpi->oxcf.superres_cfg.enable_superres) { + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (ref == NULL || ref->y_crop_width != cm->width || + ref->y_crop_height != cm->height) { + cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + } } } if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { @@ -2640,12 +2658,8 @@ av1_setup_frame(cpi); } } - - if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) { - suppress_active_map(cpi); - av1_cyclic_refresh_setup(cpi); - } av1_apply_active_map(cpi); + if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_setup(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); @@ -2660,26 +2674,26 @@ cm->cur_frame->seg.enabled = cm->seg.enabled; // This is for rtc temporal filtering case. - if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && - cm->current_frame.frame_type != KEY_FRAME) { + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { const SequenceHeader *seq_params = cm->seq_params; if (cpi->orig_source.buffer_alloc_sz == 0 || - cpi->last_source->y_width != cpi->source->y_width || - cpi->last_source->y_height != cpi->source->y_height) { + cpi->rc.prev_coded_width != cpi->oxcf.frm_dim_cfg.width || + cpi->rc.prev_coded_height != cpi->oxcf.frm_dim_cfg.height) { // Allocate a source buffer to store the true source for psnr calculation. 
if (aom_alloc_frame_buffer( &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, + 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); } - aom_yv12_copy_y(cpi->source, &cpi->orig_source); - aom_yv12_copy_u(cpi->source, &cpi->orig_source); - aom_yv12_copy_v(cpi->source, &cpi->orig_source); + aom_yv12_copy_y(cpi->source, &cpi->orig_source, 1); + aom_yv12_copy_u(cpi->source, &cpi->orig_source, 1); + aom_yv12_copy_v(cpi->source, &cpi->orig_source, 1); } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -2697,13 +2711,32 @@ update_motion_stat(cpi); // Adjust the refresh of the golden (longer-term) reference based on QP - // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode. + // selected for this frame. This is for CBR real-time mode, and only + // for single layer without usage of the set_ref_frame_config (so + // reference structure for 1 layer is set internally). if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 && svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl && + !cpi->ppi->rtc_ref.set_ref_frame_config && sf->rt_sf.gf_refresh_based_on_qp) av1_adjust_gf_refresh_qp_one_pass_rt(cpi); + // For non-svc: if scaling is required, copy scaled_source + // into scaled_last_source. 
+ if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc && + cpi->scaled_source.y_buffer != NULL && + cpi->scaled_last_source.y_buffer != NULL && + cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width && + cpi->scaled_source.y_crop_height == + cpi->scaled_last_source.y_crop_height && + (cm->width != cpi->unscaled_source->y_crop_width || + cm->height != cpi->unscaled_source->y_crop_height)) { + cpi->scaled_last_source_available = 1; + aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source, 1); + aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source, 1); + aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source, 1); + } + #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_encode_frame_time); #endif @@ -2820,7 +2853,7 @@ } cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0, - false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); #if CONFIG_TUNE_BUTTERAUGLI if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { @@ -2840,7 +2873,7 @@ cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } int scale_references = 0; @@ -3522,9 +3555,6 @@ } #endif -extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, - const char *filename); - /*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack * the bitstream * @@ -3809,6 +3839,8 @@ if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame && cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2) features->disable_cdf_update = 1; + else if (cpi->sf.rt_sf.selective_cdf_update) + features->disable_cdf_update = selective_disable_cdf_rtc(cpi); else features->disable_cdf_update = 0; break; @@ -4014,7 +4046,7 @@ } #if 
CONFIG_DENOISE -static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, +static int apply_denoise_2d(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *sd, int block_size, float noise_level, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; @@ -4022,16 +4054,16 @@ cpi->denoise_and_model = aom_denoise_and_model_alloc( cm->seq_params->bit_depth, block_size, noise_level); if (!cpi->denoise_and_model) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Error allocating denoise and model"); + aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating denoise and model"); return -1; } } if (!cpi->film_grain_table) { cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); if (!cpi->film_grain_table) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Error allocating grain table"); + aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating grain table"); return -1; } memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); @@ -4049,7 +4081,7 @@ #endif int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; @@ -4111,10 +4143,8 @@ #endif // CONFIG_DENOISE if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time, - use_highbitdepth, cpi->image_pyramid_levels, - frame_flags)) { - aom_internal_error(cm->error, AOM_CODEC_ERROR, - "av1_lookahead_push() failed"); + use_highbitdepth, cpi->alloc_pyramid, frame_flags)) { + aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed"); res = -1; } #if CONFIG_INTERNAL_STATS @@ -4131,21 +4161,21 @@ // header. 
if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Non-4:2:0 color format requires profile 1 or 2"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Profile 1 requires 4:4:4 color format"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); res = -1; } if ((seq_params->profile == PROFILE_2) && (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); res = -1; } @@ -4696,7 +4726,7 @@ aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number); } #endif - if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) { + if (cpi->ppi->use_svc) { av1_one_pass_cbr_svc_start_layer(cpi); }
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h index 0a8bcde..b0fc5cd 100644 --- a/av1/encoder/encoder.h +++ b/av1/encoder/encoder.h
@@ -21,6 +21,7 @@ #include "config/aom_config.h" #include "aom/aomcx.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" @@ -36,6 +37,7 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" @@ -73,7 +75,6 @@ #endif #include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" #ifdef __cplusplus extern "C" { @@ -1544,6 +1545,13 @@ */ bool firstpass_mt_exit; + /*! + * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker + * thread that encounters an error in order to abort the processing of other + * worker threads. + */ + bool mb_wiener_mt_exit; + #if CONFIG_MULTITHREAD /*! * Mutex lock used while dispatching jobs. @@ -2081,20 +2089,6 @@ } GlobalMotionInfo; /*! - * \brief Initial frame dimensions - * - * Tracks the frame dimensions using which: - * - Frame buffers (like altref and util frame buffers) were allocated - * - Motion estimation related initializations were done - * This structure is helpful to reallocate / reinitialize the above when there - * is a change in frame dimensions. - */ -typedef struct { - int width; /*!< initial width */ - int height; /*!< initial height */ -} InitialDimensions; - -/*! * \brief Flags related to interpolation filter search */ typedef struct { @@ -2799,7 +2793,7 @@ double total_blockiness; double worst_blockiness; - int total_bytes; + uint64_t total_bytes; double summed_quality; double summed_weights; double summed_quality_hbd; @@ -3163,11 +3157,18 @@ FRAME_INDEX_SET frame_index_set; /*! - * Structure to store the cm->width and cm->height in the last call - * of alloc_compressor_data(). - * TODO(chengchen): rename this variable or delete it. + * Stores the cm->width in the last call of alloc_compressor_data(). 
Helps + * determine whether compressor data should be reallocated when cm->width + * changes. */ - InitialDimensions initial_dimensions; + int data_alloc_width; + + /*! + * Stores the cm->height in the last call of alloc_compressor_data(). Helps + * determine whether compressor data should be reallocated when cm->height + * changes. + */ + int data_alloc_height; /*! * Number of MBs in the full-size frame; to be used to @@ -3631,10 +3632,10 @@ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL]; /*! - * Number of downsampling pyramid levels to allocate for each frame + * Should we allocate a downsampling pyramid for each frame buffer? * This is currently only used for global motion */ - int image_pyramid_levels; + bool alloc_pyramid; #if CONFIG_SALIENCY_MAP /*! @@ -3653,6 +3654,12 @@ * fast encoding pass in av1_determine_sc_tools_with_encoding(). */ int palette_pixel_num; + + /*! + * Flag to indicate scaled_last_source is available, + * so scaling is not needed for last_source. + */ + int scaled_last_source_available; } AV1_COMP; /*! @@ -3756,8 +3763,8 @@ void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf, bool sb_size_changed); -void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, - int subsampling_x, int subsampling_y); +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y); void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc); @@ -3802,7 +3809,7 @@ * copy of the pointer. */ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); /*!\brief Encode a frame @@ -3822,7 +3829,9 @@ * \retval #AOM_CODEC_OK * \retval -1 * No frame encoded; more input is required. - * \retval #AOM_CODEC_ERROR + * \retval "A nonzero (positive) aom_codec_err_t code" + * The encoding failed with the error. 
Sets the error code and error message + * in \c cpi->common.error. */ int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data); @@ -3852,8 +3861,6 @@ int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); -int av1_set_size_literal(AV1_COMP *cpi, int width, int height); - void av1_set_frame_size(AV1_COMP *cpi, int width, int height); void av1_set_mv_search_params(AV1_COMP *cpi); @@ -4304,7 +4311,7 @@ const AV1_COMMON *const cm = &cpi->common; return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && - cm->show_frame; + cm->show_frame && !cpi->is_dropped_frame; } static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) {
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h index d0fd782..f24d4b0 100644 --- a/av1/encoder/encoder_alloc.h +++ b/av1/encoder/encoder_alloc.h
@@ -77,7 +77,10 @@ av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf, cm->error); - av1_setup_sms_tree(cpi, &cpi->td); + if (av1_setup_sms_tree(cpi, &cpi->td)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } cpi->td.firstpass_ctx = av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf); if (!cpi->td.firstpass_ctx) @@ -182,11 +185,15 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; TokenInfo *token_info = &cpi->token_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int num_planes = av1_num_planes(cm); dealloc_context_buffers_ext(&cpi->mbmi_ext_info); aom_free(cpi->tile_data); cpi->tile_data = NULL; + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; // Delete sementation map aom_free(cpi->enc_seg.map); @@ -432,8 +439,7 @@ &cpi->scaled_source, scaled_width, scaled_height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, NULL, NULL, NULL, - cpi->image_pyramid_levels, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); assert(cpi->scaled_source.y_crop_width == scaled_width); @@ -458,61 +464,62 @@ for (int t = 1; t < p_mt_info->num_workers; ++t) { EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t]; thread_data->td = thread_data->original_td; - aom_free(thread_data->td->tctx); - aom_free(thread_data->td->palette_buffer); - aom_free(thread_data->td->tmp_conv_dst); - release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer); + ThreadData *const td = thread_data->td; + if (!td) continue; + aom_free(td->tctx); + aom_free(td->palette_buffer); + aom_free(td->tmp_conv_dst); + 
release_compound_type_rd_buffers(&td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { - aom_free(thread_data->td->tmp_pred_bufs[j]); + aom_free(td->tmp_pred_bufs[j]); } - aom_free(thread_data->td->pixel_gradient_info); - aom_free(thread_data->td->src_var_info_of_4x4_sub_blocks); - release_obmc_buffers(&thread_data->td->obmc_buffer); - aom_free(thread_data->td->vt64x64); + aom_free(td->pixel_gradient_info); + aom_free(td->src_var_info_of_4x4_sub_blocks); + release_obmc_buffers(&td->obmc_buffer); + aom_free(td->vt64x64); for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { - aom_free(thread_data->td->hash_value_buffer[x][y]); - thread_data->td->hash_value_buffer[x][y] = NULL; + aom_free(td->hash_value_buffer[x][y]); + td->hash_value_buffer[x][y] = NULL; } } - aom_free(thread_data->td->mv_costs_alloc); - thread_data->td->mv_costs_alloc = NULL; - aom_free(thread_data->td->dv_costs_alloc); - thread_data->td->dv_costs_alloc = NULL; - aom_free(thread_data->td->counts); - av1_free_pmc(thread_data->td->firstpass_ctx, num_planes); - thread_data->td->firstpass_ctx = NULL; - av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf); - av1_free_sms_tree(thread_data->td); + aom_free(td->mv_costs_alloc); + td->mv_costs_alloc = NULL; + aom_free(td->dv_costs_alloc); + td->dv_costs_alloc = NULL; + aom_free(td->counts); + av1_free_pmc(td->firstpass_ctx, num_planes); + td->firstpass_ctx = NULL; + av1_free_shared_coeff_buffer(&td->shared_coeff_buf); + av1_free_sms_tree(td); // This call ensures that the buffers allocated by tf_alloc_and_reset_data() // in prepare_tf_workers() for MT encode are freed in case an error is // encountered during temporal filtering (due to early termination // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be // invoked). 
- if (t < num_tf_workers) - tf_dealloc_data(&thread_data->td->tf_data, is_highbitdepth); + if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth); // This call ensures that tpl_tmp_buffers for MT encode are freed in case of // an error during tpl. - if (t < num_tpl_workers) - tpl_dealloc_temp_buffers(&thread_data->td->tpl_tmp_buffers); + if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); // This call ensures that the buffers in gm_data for MT encode are freed in // case of an error during gm. - gm_dealloc_data(&thread_data->td->gm_data); - av1_dealloc_mb_data(&thread_data->td->mb, num_planes); - aom_free(thread_data->td->mb.sb_stats_cache); - thread_data->td->mb.sb_stats_cache = NULL; - aom_free(thread_data->td->mb.sb_fp_stats); - thread_data->td->mb.sb_fp_stats = NULL; + gm_dealloc_data(&td->gm_data); + av1_dealloc_mb_data(&td->mb, num_planes); + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; #if CONFIG_PARTITION_SEARCH_ORDER - aom_free(thread_data->td->mb.rdcost); - thread_data->td->mb.rdcost = NULL; + aom_free(td->mb.rdcost); + td->mb.rdcost = NULL; #endif - av1_free_pc_tree_recursive(thread_data->td->pc_root, num_planes, 0, 0, - SEARCH_PARTITION); - thread_data->td->pc_root = NULL; - av1_dealloc_mb_wiener_var_pred_buf(thread_data->td); - aom_free(thread_data->td); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION); + td->pc_root = NULL; + av1_dealloc_mb_wiener_var_pred_buf(td); + aom_free(td); + thread_data->td = NULL; + thread_data->original_td = NULL; } }
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c index f9e446b..1c04df7 100644 --- a/av1/encoder/encoder_utils.c +++ b/av1/encoder/encoder_utils.c
@@ -9,8 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <string.h> + #include "aom/aomcx.h" +#include "av1/common/av1_common_int.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" @@ -421,11 +424,13 @@ struct segmentation *const seg = &cpi->common.seg; unsigned char *const seg_map = cpi->enc_seg.map; const unsigned char *const active_map = cpi->active_map.map; - int i; assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); - if (frame_is_intra_only(&cpi->common)) { + // Disable the active_maps on intra_only frames or if the + // input map for the current frame has no inactive blocks. + if (frame_is_intra_only(&cpi->common) || + cpi->rc.percent_blocks_inactive == 0) { cpi->active_map.enabled = 0; cpi->active_map.update = 1; } @@ -434,8 +439,7 @@ if (cpi->active_map.enabled) { const int num_mis = cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; - for (i = 0; i < num_mis; ++i) - if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + memcpy(seg_map, active_map, sizeof(active_map[0]) * num_mis); av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); @@ -706,6 +710,14 @@ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref) continue; } + // For RTC with superres on: golden reference only needs to be scaled + // if it was refreshed in previous frame. 
+ if (is_one_pass_rt_params(cpi) && + cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME && + cpi->rc.frame_num_last_gf_refresh < + (int)cm->current_frame.frame_number - 1) { + continue; + } if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { // Replace the reference buffer with a copy having a thicker border, @@ -717,7 +729,7 @@ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); if (aom_yv12_realloc_with_new_border( &ref_fb->buf, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, cpi->image_pyramid_levels, + cm->features.byte_alignment, cpi->alloc_pyramid, num_planes) != 0) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -741,7 +753,7 @@ &new_fb->buf, cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) { + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. --new_fb->ref_count; @@ -825,21 +837,7 @@ ? BLOCK_128X128 : BLOCK_64X64; } else if (oxcf->mode == REALTIME) { - if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { - const TileConfig *const tile_cfg = &oxcf->tile_cfg; - const int num_tiles = - (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows); - // For multi-thread encode: if the number of (128x128) superblocks - // per tile is low use 64X64 superblock. - if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 && - oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 && - (width * height) / (128 * 128 * num_tiles) <= 38) - return BLOCK_64X64; - else - return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64; - } else { - return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64; - } + return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64; } // TODO(any): Possibly could improve this with a heuristic. 
@@ -1079,12 +1077,12 @@ cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, - 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } av1_setup_frame(cpi);
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h index 196676e..113f62a 100644 --- a/av1/encoder/encoder_utils.h +++ b/av1/encoder/encoder_utils.h
@@ -83,9 +83,10 @@ static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; - mi_params->mi_alloc_size = 0; + mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; } @@ -1013,10 +1014,23 @@ } static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) { - // TODO(isbs): only refresh the necessary frames, rather than all of them + // Scaled references should only need to be released under certain conditions: + // if the reference will be updated, or if the scaled reference has same + // resolution. For now only apply this to Golden for non-svc RTC mode. + AV1_COMMON *const cm = &cpi->common; + const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 1 : 0; + bool release_golden = true; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; - if (buf != NULL) { + const int golden_ref = (i == GOLDEN_FRAME - 1); + if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc && + buf != NULL) { + const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME); + const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width && + buf->buf.y_crop_height == ref->buf.y_crop_height; + release_golden = refresh_golden || same_resoln; + } + if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) { --buf->ref_count; cpi->scaled_ref_buf[i] = NULL; }
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index 602a6c4..701c548 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c
@@ -76,9 +76,13 @@ void av1_free_txb_buf(AV1_COMP *cpi) { CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; aom_free(cpi->coeff_buffer_base); + cpi->coeff_buffer_base = NULL; aom_free(coeff_buf_pool->tcoeff); + coeff_buf_pool->tcoeff = NULL; aom_free(coeff_buf_pool->eobs); + coeff_buf_pool->eobs = NULL; aom_free(coeff_buf_pool->entropy_ctx); + coeff_buf_pool->entropy_ctx = NULL; } static void write_golomb(aom_writer *w, int level) { @@ -130,14 +134,14 @@ } #if CONFIG_ENTROPY_STATS -void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, - TX_CLASS tx_class, PLANE_TYPE plane, - FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, - uint8_t allow_update_cdf) { +static void update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { #else -void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, - PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, - uint8_t allow_update_cdf) { +static void update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { #endif int eob_extra; const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); @@ -619,11 +623,11 @@ td->rd_counts.tx_type_used[tx_size][tx_type]++; #if CONFIG_ENTROPY_STATS - av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, - td->counts, allow_update_cdf); + update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); #else - av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, - allow_update_cdf); + update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); #endif DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); @@ -781,8 +785,8 @@ #if CONFIG_ENTROPY_STATS FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, - td->counts, 0 
/*allow_update_cdf*/); + update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, 0 /*allow_update_cdf*/); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c index c15e396..356aa03 100644 --- a/av1/encoder/ethread.c +++ b/av1/encoder/ethread.c
@@ -10,12 +10,16 @@ */ #include <assert.h> +#include <stdbool.h> + +#include "aom_util/aom_pthread.h" #include "av1/common/warped_motion.h" #include "av1/common/thread_common.h" #include "av1/encoder/allintra_vis.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" @@ -151,7 +155,13 @@ if (sig) { pthread_mutex_lock(&row_mt_sync->mutex_[r]); - row_mt_sync->num_finished_cols[r] = cur; + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + row_mt_sync->num_finished_cols[r] = + AOMMAX(row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&row_mt_sync->cond_[r]); pthread_mutex_unlock(&row_mt_sync->mutex_[r]); @@ -246,7 +256,6 @@ row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows); - this_tile->row_ctx = NULL; if (alloc_row_ctx) { assert(max_cols > 0); const int num_row_ctx = AOMMAX(1, (max_cols - 1)); @@ -261,13 +270,9 @@ cm, enc_row_mt->num_tile_cols_done, aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows)); - enc_row_mt->allocated_tile_cols = tile_cols; - enc_row_mt->allocated_tile_rows = tile_rows; enc_row_mt->allocated_rows = max_rows; enc_row_mt->allocated_cols = max_cols - 1; enc_row_mt->allocated_sb_rows = sb_rows; - enc_row_mt->row_mt_exit = false; - enc_row_mt->firstpass_mt_exit = false; } void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { @@ -284,15 +289,16 @@ av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); - if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx); + if (cpi->oxcf.algo_cfg.cdf_update_mode) { + aom_free(this_tile->row_ctx); + this_tile->row_ctx = NULL; + } } } aom_free(enc_row_mt->num_tile_cols_done); 
enc_row_mt->num_tile_cols_done = NULL; enc_row_mt->allocated_rows = 0; enc_row_mt->allocated_cols = 0; - enc_row_mt->allocated_tile_cols = 0; - enc_row_mt->allocated_tile_rows = 0; enc_row_mt->allocated_sb_rows = 0; } @@ -574,6 +580,11 @@ } } +static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc, + const int filter_level[2]) { + return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]); +} + static int enc_row_mt_worker_hook(void *arg1, void *unused) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; @@ -588,6 +599,9 @@ AV1LfSync *const lf_sync = thread_data->lf_sync; MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; xd->error_info = error_info; + AV1_COMMON *volatile const cm = &cpi->common; + volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled( + cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level); // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 @@ -604,7 +618,7 @@ #endif set_encoding_done(cpi); - if (cpi->mt_info.pipeline_lpf_mt_with_enc) { + if (do_pipelined_lpf_mt_with_enc) { #if CONFIG_MULTITHREAD pthread_mutex_lock(lf_sync->job_mutex); lf_sync->lf_mt_exit = true; @@ -617,7 +631,6 @@ } error_info->setjmp = 1; - AV1_COMMON *const cm = &cpi->common; const int mib_size_log2 = cm->seq_params->mib_size_log2; int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; @@ -717,9 +730,7 @@ pthread_mutex_unlock(enc_row_mt_mutex_); #endif } - if (cpi->mt_info.pipeline_lpf_mt_with_enc && - (cm->lf.filter_level[PLANE_TYPE_Y] || - cm->lf.filter_level[PLANE_TYPE_UV])) { + if (do_pipelined_lpf_mt_with_enc) { // Loop-filter a superblock row if encoding of the current and next // superblock row is complete. 
// TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving @@ -831,6 +842,11 @@ AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + aom_internal_error_copy(&cpi->ppi->error, cm->error); + } + cm->error->setjmp = 1; // Initialize enc row MT object. if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; @@ -892,7 +908,6 @@ aom_malloc(sizeof(*(tpl_row_mt->mutex_)))); if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL); } - tpl_row_mt->tpl_mt_exit = false; #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { @@ -919,6 +934,7 @@ if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); } } + cm->error->setjmp = 0; } #endif // CONFIG_MULTITHREAD @@ -951,48 +967,48 @@ if (i > 0) { // Allocate thread data. - AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td, - aom_memalign(32, sizeof(*thread_data->td))); - av1_zero(*thread_data->td); - thread_data->original_td = thread_data->td; + ThreadData *td; + AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td))); + av1_zero(*td); + thread_data->original_td = thread_data->td = td; // Set up shared coeff buffers. - av1_setup_shared_coeff_buffer( - &ppi->seq_params, &thread_data->td->shared_coeff_buf, &ppi->error); - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->tmp_conv_dst, - aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * - sizeof(*thread_data->td->tmp_conv_dst))); + av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf, + &ppi->error); + AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*td->tmp_conv_dst))); if (i < p_mt_info->num_mod_workers[MOD_FP]) { // Set up firstpass PICK_MODE_CONTEXT. 
- thread_data->td->firstpass_ctx = av1_alloc_pmc( - ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf); - if (!thread_data->td->firstpass_ctx) + td->firstpass_ctx = + av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf); + if (!td->firstpass_ctx) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } if (!is_first_pass && i < num_enc_workers) { // Set up sms_tree. - av1_setup_sms_tree(ppi->cpi, thread_data->td); + if (av1_setup_sms_tree(ppi->cpi, td)) { + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->hash_value_buffer[x][y], - (uint32_t *)aom_malloc( - AOM_BUFFER_SIZE_FOR_BLOCK_HASH * - sizeof(*thread_data->td->hash_value_buffer[0][0]))); + &ppi->error, td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*td->hash_value_buffer[0][0]))); // Allocate frame counters in thread data. - AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td->counts, - aom_calloc(1, sizeof(*thread_data->td->counts))); + AOM_CHECK_MEM_ERROR(&ppi->error, td->counts, + aom_calloc(1, sizeof(*td->counts))); // Allocate buffers used by palette coding mode. - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->palette_buffer, - aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer, + aom_memalign(16, sizeof(*td->palette_buffer))); // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are // used in inter frames to store intermediate inter mode prediction @@ -1000,26 +1016,23 @@ // memory allocations for these buffers are avoided for allintra // encoding mode. 
if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) { - alloc_obmc_buffers(&thread_data->td->obmc_buffer, &ppi->error); + alloc_obmc_buffers(&td->obmc_buffer, &ppi->error); - alloc_compound_type_rd_buffers(&ppi->error, - &thread_data->td->comp_rd_buffer); + alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->tmp_pred_bufs[j], - aom_memalign(32, - 2 * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->tmp_pred_bufs[j]))); + &ppi->error, td->tmp_pred_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*td->tmp_pred_bufs[j]))); } } if (is_gradient_caching_for_hog_enabled(ppi->cpi)) { const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome; - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->pixel_gradient_info, - aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) * - plane_types * MAX_SB_SQUARE)); + AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info, + aom_malloc(sizeof(*td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); } if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) { @@ -1028,18 +1041,17 @@ mi_size_wide[sb_size] * mi_size_high[sb_size]; AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->src_var_info_of_4x4_sub_blocks, - aom_malloc( - sizeof(*thread_data->td->src_var_info_of_4x4_sub_blocks) * - mi_count_in_sb)); + &ppi->error, td->src_var_info_of_4x4_sub_blocks, + aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) * + mi_count_in_sb)); } if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = (ppi->seq_params.sb_size == BLOCK_64X64) ? 
1 : 4; AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->vt64x64, - aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); + &ppi->error, td->vt64x64, + aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks)); } } } @@ -1076,7 +1088,7 @@ &ppi->error, p_mt_info->tile_thr_data, aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data))); - for (int i = num_workers - 1; i >= 0; i--) { + for (int i = 0; i < num_workers; ++i) { AVxWorker *const worker = &p_mt_info->workers[i]; EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; @@ -1112,7 +1124,8 @@ // This function returns 1 if frame parallel encode is supported for // the current configuration. Returns 0 otherwise. -static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { +static AOM_INLINE int is_fpmt_config(const AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { // FPMT is enabled for AOM_Q and AOM_VBR. // TODO(Tarun): Test and enable resize config. if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) { @@ -1150,7 +1163,7 @@ } int av1_check_fpmt_config(AV1_PRIMARY *const ppi, - AV1EncoderConfig *const oxcf) { + const AV1EncoderConfig *const oxcf) { if (is_fpmt_config(ppi, oxcf)) return 1; // Reset frame parallel configuration for unsupported config if (ppi->num_fp_contexts > 1) { @@ -1247,6 +1260,9 @@ return workers_per_frame; } +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); + // Prepare level 1 workers. This function is only called for // parallel_frame_count > 1. 
This function populates the mt_info structure of // frame level contexts appropriately by dividing the total number of available @@ -1262,17 +1278,30 @@ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; int num_workers = p_mt_info->num_workers; - int frame_idx = 0; - int i = 0; + volatile int frame_idx = 0; + volatile int i = 0; while (i < num_workers) { // Assign level 1 worker AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] = &p_mt_info->workers[i]; AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; - AV1_COMMON *const cm = &cur_cpi->common; - const int num_planes = av1_num_planes(cm); + // This 'aom_internal_error_info' pointer is not derived from the local + // pointer ('AV1_COMMON *const cm') to silence the compiler warning + // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]". + struct aom_internal_error_info *const error = cur_cpi->common.error; + // The jmp_buf is valid only within the scope of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
+ if (setjmp(error->jmp)) { + error->setjmp = 0; + restore_workers_after_fpmt(ppi, parallel_frame_count, i); + aom_internal_error_copy(&ppi->error, error); + } + error->setjmp = 1; + + AV1_COMMON *const cm = &cur_cpi->common; // Assign start of level 2 worker pool mt_info->workers = &p_mt_info->workers[i]; mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i]; @@ -1281,13 +1310,14 @@ num_workers - i, parallel_frame_count - frame_idx); for (int j = MOD_FP; j < NUM_MT_MODULES; j++) { mt_info->num_mod_workers[j] = - AOMMIN(mt_info->num_workers, ppi->p_mt_info.num_mod_workers[j]); + AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]); } - if (ppi->p_mt_info.cdef_worker != NULL) { - mt_info->cdef_worker = &ppi->p_mt_info.cdef_worker[i]; + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker = &p_mt_info->cdef_worker[i]; // Back up the original cdef_worker pointers. mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf; + const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; plane++) mt_info->restore_state_buf.cdef_colbuf[plane] = mt_info->cdef_worker->colbuf[plane]; @@ -1308,6 +1338,8 @@ } #endif + i += mt_info->num_workers; + // At this stage, the thread specific CDEF buffers for the current frame's // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has // already been allocated across parallel frames. @@ -1320,7 +1352,7 @@ ? first_cpi_data : &ppi->parallel_frames_data[frame_idx - 1]; frame_idx++; - i += mt_info->num_workers; + error->setjmp = 0; } p_mt_info->p_num_workers = parallel_frame_count; } @@ -1340,25 +1372,24 @@ } // Restore worker states after parallel encode. 
-static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi, - int parallel_frame_count) { +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); (void)parallel_frame_count; PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; - int num_workers = p_mt_info->num_workers; int frame_idx = 0; int i = 0; - while (i < num_workers) { + while (i < num_fpmt_workers_prepared) { AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; const AV1_COMMON *const cm = &cur_cpi->common; const int num_planes = av1_num_planes(cm); // Restore the original cdef_worker pointers. - if (ppi->p_mt_info.cdef_worker != NULL) { + if (p_mt_info->cdef_worker != NULL) { mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf; for (int plane = 0; plane < num_planes; plane++) mt_info->cdef_worker->colbuf[plane] = @@ -1388,7 +1419,7 @@ int num_workers = ppi->p_mt_info.p_num_workers; int had_error = 0; // Points to error in the earliest display order frame in the parallel set. - const struct aom_internal_error_info *error; + const struct aom_internal_error_info *error = NULL; // Encoding ends. for (int i = num_workers - 1; i >= 0; --i) { @@ -1399,10 +1430,10 @@ } } - restore_workers_after_fpmt(ppi, frames_in_parallel_set); + restore_workers_after_fpmt(ppi, frames_in_parallel_set, + ppi->p_mt_info.num_workers); - if (had_error) - aom_internal_error(&ppi->error, error->error_code, "%s", error->detail); + if (had_error) aom_internal_error_copy(&ppi->error, error); } static int get_compressed_data_hook(void *arg1, void *arg2) { @@ -1416,8 +1447,8 @@ // This function encodes the raw frame data for each frame in parallel encode // set, and outputs the frame bit stream to the designated buffers. 
-int av1_compress_parallel_frames(AV1_PRIMARY *const ppi, - AV1_COMP_DATA *const first_cpi_data) { +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf // corresponding to frames in the current parallel encode set. int ref_buffers_used_map = 0; @@ -1435,7 +1466,6 @@ } av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool, ref_buffers_used_map); - return AOM_CODEC_OK; } static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, @@ -1472,9 +1502,7 @@ } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); // Restore xd->error_info of the main thread back to cm->error so that the // multithreaded code, when executed using a single thread, has a valid @@ -1648,13 +1676,10 @@ thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; - } - - if (thread_data->td != &cpi->td) { // Before encoding a frame, copy the thread data from cpi. 
thread_data->td->mb = cpi->td.mb; - av1_alloc_src_diff_buf(cm, &thread_data->td->mb); } + av1_alloc_src_diff_buf(cm, &thread_data->td->mb); } } #endif @@ -1856,8 +1881,9 @@ const int plane_start = 0; const int plane_end = av1_num_planes(cm); int planes_to_lf[MAX_MB_PLANE]; - if ((lf->filter_level[PLANE_TYPE_Y] || lf->filter_level[PLANE_TYPE_UV]) && - check_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end)) { + if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, + lf->filter_level)) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); int lpf_opt_level = get_lpf_opt_level(&cpi->sf); assert(lpf_opt_level == 2); @@ -1923,6 +1949,7 @@ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); memset(enc_row_mt->num_tile_cols_done, 0, sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame); + enc_row_mt->row_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { @@ -2001,6 +2028,7 @@ memset(thread_id_to_tile_id, -1, sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); + enc_row_mt->firstpass_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { @@ -2082,7 +2110,13 @@ if (sig) { pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); - tpl_row_mt_sync->num_finished_cols[r] = cur; + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + tpl_row_mt_sync->num_finished_cols[r] = + AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&tpl_row_mt_sync->cond_[r]); pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]); @@ -2197,8 +2231,8 @@ } // Allocate memory for tpl row synchronization. 
-void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, - int mb_rows) { +static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, + int mb_rows) { tpl_sync->rows = mb_rows; #if CONFIG_MULTITHREAD { @@ -2299,6 +2333,7 @@ av1_tpl_alloc(tpl_sync, cm, mb_rows); } tpl_sync->num_threads_working = num_workers; + mt_info->tpl_row_mt.tpl_mt_exit = false; // Initialize cur_mb_col to -1 for all MB rows. memset(tpl_sync->num_finished_cols, -1, @@ -2487,7 +2522,7 @@ static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; int total_refs = gm_info->num_ref_frames[cur_dir]; int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; @@ -2518,7 +2553,7 @@ AV1_COMP *cpi = thread_data->cpi; GlobalMotionInfo *gm_info = &cpi->gm_info; AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; - JobInfo *job_info = &gm_sync->job_info; + GlobalMotionJobInfo *job_info = &gm_sync->job_info; int thread_id = thread_data->thread_id; GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; #if CONFIG_MULTITHREAD @@ -2656,7 +2691,7 @@ // Implements multi-threading for global motion. 
void av1_global_motion_estimation_mt(AV1_COMP *cpi) { - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; av1_zero(*job_info); @@ -2713,6 +2748,28 @@ } } +static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL); + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; + const int mt_unit_cols = + (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + + // Update the wiener variance computation of every row in the frame to + // indicate that it is complete in order to avoid dependent workers waiting + // indefinitely. + for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows; + mi_row += mb_step, ++mt_thread_id) { + intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_cols - 1, mt_unit_cols); + } +} + static int cal_mb_wiener_var_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *const thread_data = (EncWorkerData *)arg1; @@ -2726,25 +2783,44 @@ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; (void)enc_row_mt; #if CONFIG_MULTITHREAD - pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; + pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_; #endif + + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
+ if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex); + enc_row_mt->mb_wiener_mt_exit = true; + pthread_mutex_unlock(enc_row_mt_mutex); +#endif + set_mb_wiener_var_calc_done(cpi); + return 0; + } + error_info->setjmp = 1; DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); double sum_rec_distortion = 0; double sum_est_rate = 0; - int has_jobs = 1; - while (has_jobs) { + while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD - pthread_mutex_lock(enc_row_mt_mutex_); + pthread_mutex_lock(enc_row_mt_mutex); #endif - has_jobs = - get_next_job_allintra(intra_row_mt_sync, cpi->common.mi_params.mi_rows, - &current_mi_row, mb_step); + int has_jobs = enc_row_mt->mb_wiener_mt_exit + ? 0 + : get_next_job_allintra(intra_row_mt_sync, + cpi->common.mi_params.mi_rows, + &current_mi_row, mb_step); #if CONFIG_MULTITHREAD - pthread_mutex_unlock(enc_row_mt_mutex_); + pthread_mutex_unlock(enc_row_mt_mutex); #endif if (!has_jobs) break; // TODO(chengchen): properly accumulate the distortion and rate. 
@@ -2753,13 +2829,14 @@ &sum_est_rate, thread_data->td->wiener_tmp_pred_buf); #if CONFIG_MULTITHREAD - pthread_mutex_lock(enc_row_mt_mutex_); + pthread_mutex_lock(enc_row_mt_mutex); #endif intra_row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD - pthread_mutex_unlock(enc_row_mt_mutex_); + pthread_mutex_unlock(enc_row_mt_mutex); #endif } + error_info->setjmp = 0; return 1; } @@ -2799,6 +2876,7 @@ intra_row_mt_sync->next_mi_row = 0; memset(intra_row_mt_sync->num_finished_cols, -1, sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows); + mt_info->enc_row_mt.mb_wiener_mt_exit = false; prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers); launch_workers(mt_info, num_workers); @@ -3053,6 +3131,7 @@ AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; pack_bs_sync->next_job_idx = 0; + pack_bs_sync->pack_bs_mt_exit = false; PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; // Reset tile order data of pack bitstream @@ -3188,6 +3267,7 @@ cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; } // Checks if a job is available. If job is available,
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h index f3f8629..138811c 100644 --- a/av1/encoder/ethread.h +++ b/av1/encoder/ethread.h
@@ -122,10 +122,11 @@ int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf); -int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf); +int av1_check_fpmt_config(AV1_PRIMARY *const ppi, + const AV1EncoderConfig *const oxcf); -int av1_compress_parallel_frames(AV1_PRIMARY *const ppi, - AV1_COMP_DATA *const first_cpi_data); +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data); #ifdef __cplusplus } // extern "C" #endif
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c index 3631113..b94a507 100644 --- a/av1/encoder/firstpass.c +++ b/av1/encoder/firstpass.c
@@ -22,6 +22,7 @@ #include "aom_ports/mem.h" #include "aom_scale/aom_scale.h" #include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" @@ -1106,6 +1107,7 @@ const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; + av1_alloc_src_diff_buf(cm, &cpi->td.mb); for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = @@ -1391,7 +1393,6 @@ av1_init_mode_probs(cm->fc); av1_init_mv_probs(cm); av1_initialize_rd_consts(cpi); - av1_alloc_src_diff_buf(cm, &cpi->td.mb); enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c index 73910de..0ae4780 100644 --- a/av1/encoder/global_motion.c +++ b/av1/encoder/global_motion.c
@@ -30,83 +30,6 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -/* clang-format off */ -// Error metric used for global motion evaluation. -// For 8-bit input, the pixel error used to index this table will always -// be between -255 and +255. But for 10- and 12-bit input, we use interpolation -// which means that we need to support indices of -256 and +256 as well. -// Therefore, the table is offset so that logical index 0 corresponds to -// error_measure_lut[256]. -const int error_measure_lut[513] = { - // pow 0.7 - 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113, - 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749, - 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381, - 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010, - 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634, - 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254, - 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870, - 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481, - 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088, - 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689, - 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285, - 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875, - 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458, - 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036, - 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606, - 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168, - 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723, - 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268, - 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804, - 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329, - 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842, - 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341, - 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826, - 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293, - 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741, - 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164, - 5090, 5015, 
4941, 4865, 4789, 4713, 4636, 4558, - 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916, - 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224, - 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461, - 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577, - 1452, 1323, 1187, 1045, 894, 731, 550, 339, - 0, 339, 550, 731, 894, 1045, 1187, 1323, - 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255, - 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041, - 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748, - 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401, - 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015, - 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599, - 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157, - 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695, - 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214, - 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718, - 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208, - 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686, - 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153, - 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610, - 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058, - 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497, - 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929, - 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353, - 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771, - 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183, - 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588, - 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988, - 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383, - 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773, - 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159, - 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539, - 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916, - 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289, - 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657, - 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022, - 16068, 16113, 16158, 16204, 16249, 16294, 16339, 
16384, - 16384, -}; -/* clang-format on */ - int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) { return best_erroradvantage < erroradv_tr && best_erroradvantage * params_cost < erroradv_prod_tr; @@ -541,6 +464,11 @@ } wm->wmtype = get_wmtype(wm); + // Recompute shear params for the refined model + // This should never fail, because we only ever consider warp-able models + if (!av1_get_shear_params(wm)) { + assert(0); + } return best_error; }
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h index 8c9c60f..2645f93 100644 --- a/av1/encoder/global_motion.h +++ b/av1/encoder/global_motion.h
@@ -14,8 +14,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_scale/yv12config.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" +#include "av1/encoder/enc_enums.h" #ifdef __cplusplus extern "C" { @@ -57,11 +57,11 @@ // next_frame_to_process[i] will hold the count of next reference frame to be // processed in the direction 'i'. int8_t next_frame_to_process[MAX_DIRECTIONS]; -} JobInfo; +} GlobalMotionJobInfo; typedef struct { // Data related to assigning jobs for global motion multi-threading. - JobInfo job_info; + GlobalMotionJobInfo job_info; #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. @@ -97,37 +97,6 @@ int height, int *inliers, int num_inliers); -extern const int error_measure_lut[513]; - -static INLINE int error_measure(int err) { - return error_measure_lut[256 + err]; -} - -#if CONFIG_AV1_HIGHBITDEPTH -static INLINE int highbd_error_measure(int err, int bd) { - const int b = bd - 8; - const int bmask = (1 << b) - 1; - const int v = (1 << b); - - // Split error into two parts and do an interpolated table lookup - // To compute the table index and interpolation value, we want to calculate - // the quotient and remainder of err / 2^b. But it is very important that - // the division must round down, and the remainder must be positive, - // ie. in the range [0, 2^b). - // - // In C, the >> and & operators do what we want, but the / and % operators - // give the wrong results for negative inputs. So we must use >> and & here. - // - // For example, if bd == 10 and err == -5, compare the results: - // (-5) >> 2 = -2, (-5) & 3 = 3 - // vs. 
(-5) / 4 = -1, (-5) % 4 = -1 - const int e1 = err >> b; - const int e2 = err & bmask; - return error_measure_lut[256 + e1] * (v - e2) + - error_measure_lut[257 + e1] * e2; -} -#endif // CONFIG_AV1_HIGHBITDEPTH - int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int p_width, int p_height,
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c index 02a4e70..687eeee 100644 --- a/av1/encoder/global_motion_facade.c +++ b/av1/encoder/global_motion_facade.c
@@ -89,6 +89,7 @@ assert(ref_buf[frame] != NULL); int bit_depth = cpi->common.seq_params->bit_depth; GlobalMotionMethod global_motion_method = default_global_motion_method; + int downsample_level = cpi->sf.gm_sf.downsample_level; int num_refinements = cpi->sf.gm_sf.num_refinement_steps; bool mem_alloc_failed = false; @@ -99,9 +100,10 @@ double best_erroradv = erroradv_tr; for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE; model <= LAST_GLOBAL_TRANS_TYPE; ++model) { - if (!aom_compute_global_motion( - model, cpi->source, ref_buf[frame], bit_depth, global_motion_method, - motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { + if (!aom_compute_global_motion(model, cpi->source, ref_buf[frame], + bit_depth, global_motion_method, + downsample_level, motion_models, + RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { if (mem_alloc_failed) { aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate global motion buffers"); @@ -115,6 +117,9 @@ WarpedMotionParams tmp_wm_params; av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params); + // Check that the generated model is warp-able + if (!av1_get_shear_params(&tmp_wm_params)) continue; + // Skip models that we won't use (IDENTITY or TRANSLATION) // // For IDENTITY type models, we don't need to evaluate anything because @@ -151,6 +156,14 @@ double erroradvantage = (double)warp_error / ref_frame_error; + // Check that the model signaling cost is not too high + if (!av1_is_enough_erroradvantage( + erroradvantage, + gm_get_params_cost(&tmp_wm_params, ref_params, + cm->features.allow_high_precision_mv))) { + continue; + } + if (erroradvantage < best_erroradv) { best_erroradv = erroradvantage; // Save the wm_params modified by @@ -161,34 +174,6 @@ } } } - - if (!av1_get_shear_params(&cm->global_motion[frame])) - cm->global_motion[frame] = default_warp_params; - -#if 0 - // We never choose translational models, so this code is disabled - if (cm->global_motion[frame].wmtype == TRANSLATION) { - 
cm->global_motion[frame].wmmat[0] = - convert_to_trans_prec(cm->features.allow_high_precision_mv, - cm->global_motion[frame].wmmat[0]) * - GM_TRANS_ONLY_DECODE_FACTOR; - cm->global_motion[frame].wmmat[1] = - convert_to_trans_prec(cm->features.allow_high_precision_mv, - cm->global_motion[frame].wmmat[1]) * - GM_TRANS_ONLY_DECODE_FACTOR; - } -#endif - - if (cm->global_motion[frame].wmtype == IDENTITY) return; - - // If the best error advantage found doesn't meet the threshold for - // this motion type, revert to IDENTITY. - if (!av1_is_enough_erroradvantage( - best_erroradv, - gm_get_params_cost(&cm->global_motion[frame], ref_params, - cm->features.allow_high_precision_mv))) { - cm->global_motion[frame] = default_warp_params; - } } // Computes global motion for the given reference frame.
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c index 3091037..8037b59 100644 --- a/av1/encoder/hash.c +++ b/av1/encoder/hash.c
@@ -10,6 +10,7 @@ */ #include "av1/encoder/hash.h" +#include "config/av1_rtcd.h" static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, uint8_t *pData, uint32_t dataLength) {
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h index 4be2038..2390293 100644 --- a/av1/encoder/k_means_template.h +++ b/av1/encoder/k_means_template.h
@@ -24,6 +24,9 @@ #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y) #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM) +#define K_MEANS_RENAME_C(x, y) x##_dim##y##_c +#define RENAME_C_(x, y) K_MEANS_RENAME_C(x, y) +#define RENAME_C(x) RENAME_C_(x, AV1_K_MEANS_DIM) // Though we want to compute the smallest L2 norm, in 1 dimension, // it is equivalent to find the smallest L1 norm and then square it. @@ -41,8 +44,8 @@ #endif } -void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids, - uint8_t *indices, int64_t *dist, int n, int k) { +void RENAME_C(av1_calc_indices)(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *dist, int n, int k) { if (dist) { *dist = 0; } @@ -149,3 +152,6 @@ } #undef RENAME_ #undef RENAME +#undef K_MEANS_RENAME_C +#undef RENAME_C_ +#undef RENAME_C
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c index 9ef9b88..476c91a 100644 --- a/av1/encoder/lookahead.c +++ b/av1/encoder/lookahead.c
@@ -46,7 +46,7 @@ unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, - bool is_all_intra, int num_pyramid_levels) { + bool is_all_intra, bool alloc_pyramid) { int lag_in_frames = AOMMAX(1, depth); // For all-intra frame encoding, previous source frames are not required. @@ -82,7 +82,7 @@ if (aom_realloc_frame_buffer( &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, - NULL, num_pyramid_levels, 0)) { + NULL, alloc_pyramid, 0)) { goto fail; } } @@ -100,7 +100,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, - int num_pyramid_levels, aom_enc_frame_flags_t flags) { + bool alloc_pyramid, aom_enc_frame_flags_t flags) { int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; @@ -124,9 +124,9 @@ height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != buf->img.uv_crop_height; - larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || - uv_width > buf->img.uv_width || - uv_height > buf->img.uv_height; + larger_dimensions = + width > buf->img.y_crop_width || height > buf->img.y_crop_height || + uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); if (larger_dimensions) { @@ -134,11 +134,15 @@ memset(&new_img, 0, sizeof(new_img)); if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, subsampling_y, use_highbitdepth, - AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0)) + AOM_BORDER_IN_PIXELS, 0, alloc_pyramid, 0)) return 1; aom_free_frame_buffer(&buf->img); buf->img = new_img; } else if (new_dimensions) { + buf->img.y_width = src->y_width; + buf->img.y_height = src->y_height; + 
buf->img.uv_width = src->uv_width; + buf->img.uv_height = src->uv_height; buf->img.y_crop_width = src->y_crop_width; buf->img.y_crop_height = src->y_crop_height; buf->img.uv_crop_width = src->uv_crop_width; @@ -146,7 +150,6 @@ buf->img.subsampling_x = src->subsampling_x; buf->img.subsampling_y = src->subsampling_y; } - // Partial copy not implemented yet av1_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start;
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h index c0e6d22..41eca87 100644 --- a/av1/encoder/lookahead.h +++ b/av1/encoder/lookahead.h
@@ -70,7 +70,7 @@ unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, - bool is_all_intra, int num_pyramid_levels); + bool is_all_intra, bool alloc_pyramid); /**\brief Destroys the lookahead stage */ @@ -85,18 +85,18 @@ * This function will copy the source image into a new framebuffer with * the expected stride/border. * - * \param[in] ctx Pointer to the lookahead context - * \param[in] src Pointer to the image to enqueue - * \param[in] ts_start Timestamp for the start of this frame - * \param[in] ts_end Timestamp for the end of this frame - * \param[in] use_highbitdepth Tell if HBD is used - * \param[in] num_pyramid_levels Number of pyramid levels to allocate - for each frame buffer - * \param[in] flags Flags set on this frame + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] use_highbitdepth Tell if HBD is used + * \param[in] alloc_pyramid Whether to allocate a downsampling pyramid + * for each frame buffer + * \param[in] flags Flags set on this frame */ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, - int num_pyramid_levels, aom_enc_frame_flags_t flags); + bool alloc_pyramid, aom_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode *
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c index 2462f1b..f3a9828 100644 --- a/av1/encoder/mcomp.c +++ b/av1/encoder/mcomp.c
@@ -2153,7 +2153,7 @@ aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); }
diff --git a/av1/encoder/nonrd_opt.c b/av1/encoder/nonrd_opt.c index 651ca43..e3589da 100644 --- a/av1/encoder/nonrd_opt.c +++ b/av1/encoder/nonrd_opt.c
@@ -10,6 +10,7 @@ */ #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/reconinter.h"
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index 9be3237..317d5c7 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c
@@ -23,6 +23,7 @@ #include "av1/encoder/model_rd.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/palette.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/var_based_part.h" @@ -577,7 +578,7 @@ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; const uint32_t dc_quant = p->dequant_QTX[0]; const uint32_t ac_quant = p->dequant_QTX[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; const int bw = b_width_log2_lookup[bsize]; const int bh = b_height_log2_lookup[bsize]; @@ -597,6 +598,11 @@ #endif + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + dc_thr = dc_thr << 1; + ac_thr = ac_thr << 2; + } + for (int k = 0; k < num_blk; k++) { // Check if all ac coefficients can be quantized to zero. if (!(var_tx[k] < ac_thr || var == 0)) { @@ -626,10 +632,12 @@ const BLOCK_SIZE uv_bsize = get_plane_block_size( bsize, puvd->subsampling_x, puvd->subsampling_y); // Adjust these thresholds for UV. + const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3; + const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3; const int64_t uv_dc_thr = - (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3; + (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc; const int64_t uv_ac_thr = - (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; + (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, plane, plane); var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, @@ -1641,6 +1649,50 @@ } } + const unsigned int thresh_sad = + cpi->sf.rt_sf.prune_palette_search_nonrd > 1 ? 100 : 20; + const unsigned int best_sad_norm = + args.best_sad >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + + // Try palette if it's enabled. 
+ bool try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mi->bsize); + if (cpi->sf.rt_sf.prune_palette_search_nonrd > 0) { + bool prune = + (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) && + bsize <= BLOCK_16X16 && x->source_variance > 200; + try_palette &= prune; + } + if (try_palette) { + const TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const unsigned int intra_ref_frame_cost = 0; + x->color_palette_thresh = (best_sad_norm < 500) ? 32 : 64; + + // Search palette mode for Luma plane in intra frame. + av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, + &this_rdc, best_rdc.rdcost); + // Update best mode data. + if (this_rdc.rdcost < best_rdc.rdcost) { + best_mode = DC_PRED; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + best_rdc.rate = this_rdc.rate; + best_rdc.dist = this_rdc.dist; + best_rdc.rdcost = this_rdc.rdcost; + if (!this_rdc.skip_txfm) { + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + } + if (xd->tx_type_map[0] != DCT_DCT) + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } else { + av1_zero(mi->palette_mode_info); + } + } + mi->mode = best_mode; // Keep DC for UV since mode test is based on Y channel only. mi->uv_mode = UV_DC_PRED; @@ -1762,7 +1814,7 @@ x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) { - int thr = (cm->width * cm->height >= 640 * 360) ? 100 : 150; + int thr = (cm->width * cm->height > RESOLUTION_288P) ? 
100 : 150; int pred = x->pred_mv_sad[LAST_FRAME] >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); if (pred > thr) use_golden_ref_frame = 1; @@ -1879,14 +1931,17 @@ static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, - int extra_prune, unsigned int sse_zeromv_norm, int more_prune) { + int extra_prune, unsigned int sse_zeromv_norm, int more_prune, + int skip_nearmv) { const unsigned int thresh_skip_golden = 500; if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden && mode == NEWMV) return 1; - if (bsize == BLOCK_128X128 && mode == NEWMV) return 1; + if ((bsize == BLOCK_128X128 && mode == NEWMV) || + (skip_nearmv && mode == NEARMV)) + return 1; // Skip testing non-LAST if this flag is set. if (extra_prune) { @@ -1933,11 +1988,16 @@ return; } int shift = 3; + unsigned int source_var_thr = 50; + int uv_sad_thr = 100; if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res) shift = 4; - if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && - cpi->rc.high_source_sad) { - shift = 6; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (cpi->rc.high_source_sad) shift = 6; + if (source_sad_nonrd > kMedSad) { + source_var_thr = 1200; + uv_sad_thr = 10; + } } NOISE_LEVEL noise_level = kLow; int norm_sad = @@ -1975,7 +2035,7 @@ uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); x->color_sensitivity[COLOR_SENS_IDX(plane)] = uv_sad > (y_sad >> shift) && norm_uv_sad > 40; - if (source_variance < 50 && norm_uv_sad > 100) + if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr) x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1; } } @@ -2345,6 +2405,22 @@ *ref_frame2 = NONE_FRAME; } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) && + (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) + return true; + + // Skip the mode if use reference frame mask flag is not set. 
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true; + + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. + if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2384,18 +2460,6 @@ mi->ref_frame[0] = *ref_frame; mi->ref_frame[1] = *ref_frame2; - // Skip the mode if use reference frame mask flag is not set. - if (!search_state->use_ref_frame_mask[*ref_frame]) return true; - - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && @@ -2430,7 +2494,8 @@ return true; // For screen content: skip mode testing based on source_sad. - if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + !x->force_zeromv_skip_for_blk) { // If source_sad is computed: skip non-zero motion // check for stationary (super)blocks. Otherwise if superblock // has motion skip the modes with zero motion on last reference @@ -2450,7 +2515,9 @@ return true; } // Skip NEWMV search for flat blocks. - if (*this_mode == NEWMV && x->source_variance < 100) return true; + if (rt_sf->skip_newmv_flat_blocks_screen && *this_mode == NEWMV && + x->source_variance < 100) + return true; // Skip non-LAST for color on flat blocks. 
if (*ref_frame > LAST_FRAME && x->source_variance == 0 && (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || @@ -2462,7 +2529,8 @@ // properties. if (skip_mode_by_bsize_and_ref_frame( *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search, - sse_zeromv_norm, rt_sf->nonrd_aggressive_skip)) + sse_zeromv_norm, rt_sf->nonrd_aggressive_skip, + rt_sf->increase_source_sad_thresh)) return true; // Skip mode based on low temporal variance and souce sad. @@ -2940,9 +3008,9 @@ // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit // when issue 3359 is fixed. - if (cm->seq_params->bit_depth == 8 && - cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette && - !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk && + if (cm->seq_params->bit_depth == 8 && rt_sf->use_idtx_nonrd && + !skip_idtx_palette && !cpi->oxcf.txfm_cfg.use_inter_dct_only && + !x->force_zeromv_skip_for_blk && is_inter_mode(best_pickmode->best_mode) && best_pickmode->best_pred != NULL && (!rt_sf->prune_idtx_nonrd || @@ -3046,6 +3114,34 @@ } } +static AOM_INLINE bool enable_palette(AV1_COMP *cpi, bool is_mode_intra, + BLOCK_SIZE bsize, + unsigned int source_variance, + int force_zeromv_skip, + int skip_idtx_palette, + int force_palette_test) { + if (!cpi->oxcf.tool_cfg.enable_palette) return false; + if (!av1_allow_palette(cpi->common.features.allow_screen_content_tools, + bsize)) { + return false; + } + if (skip_idtx_palette) return false; + + if (cpi->sf.rt_sf.prune_palette_search_nonrd > 1 && + ((cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) || + bsize > BLOCK_16X16)) { + return false; + } + + if ((is_mode_intra || force_palette_test) && source_variance > 0 && + !force_zeromv_skip && + (cpi->rc.high_source_sad || source_variance > 300)) { + return true; + } else { + return false; + } +} + /*!\brief AV1 inter mode selection based on Non-RD optimized model. 
* * \ingroup nonrd_mode_search @@ -3229,7 +3325,8 @@ inter_pred_params_sr.conv_params = get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd); - x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad; + x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad || + segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !x->force_zeromv_skip_for_blk && x->content_state_sb.source_sad_nonrd != kZeroSad && @@ -3400,17 +3497,9 @@ x->content_state_sb.source_sad_nonrd != kZeroSad && !cpi->rc.high_source_sad; - int try_palette = - !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette && - av1_allow_palette(cpi->common.features.allow_screen_content_tools, - mi->bsize); - try_palette = - try_palette && - (is_mode_intra(best_pickmode->best_mode) || force_palette_test) && - x->source_variance > 0 && !x->force_zeromv_skip_for_blk && - (cpi->rc.high_source_sad || x->source_variance > 300); - - if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0; + bool try_palette = enable_palette( + cpi, is_mode_intra(best_pickmode->best_mode), bsize, x->source_variance, + x->force_zeromv_skip_for_blk, skip_idtx_palette, force_palette_test); // Perform screen content mode evaluation for non-rd handle_screen_content_mode_nonrd(
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c index 7f79e95..6ae1c6c 100644 --- a/av1/encoder/palette.c +++ b/av1/encoder/palette.c
@@ -480,7 +480,7 @@ int count; }; -int color_count_comp(const void *c1, const void *c2) { +static int color_count_comp(const void *c1, const void *c2) { const struct ColorCount *color_count1 = (const struct ColorCount *)c1; const struct ColorCount *color_count2 = (const struct ColorCount *)c2; if (color_count1->count > color_count2->count) return -1; @@ -564,7 +564,7 @@ } uint8_t *const color_map = xd->plane[0].color_index_map; - int color_thresh_palette = 64; + int color_thresh_palette = x->color_palette_thresh; // Allow for larger color_threshold for palette search, based on color, // scene_change, and block source variance. // Since palette is Y based, only allow larger threshold if block
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h index 7da863a..30886d3 100644 --- a/av1/encoder/palette.h +++ b/av1/encoder/palette.h
@@ -26,7 +26,7 @@ struct macroblock; /*!\cond */ -#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c +#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k,
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c index 1e3d980..30ea7d9 100644 --- a/av1/encoder/partition_search.c +++ b/av1/encoder/partition_search.c
@@ -2144,8 +2144,9 @@ } if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); } - if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm && - !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) + if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ || + cpi->active_map.enabled) && + mbmi->skip_txfm && !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run); // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block @@ -2254,6 +2255,8 @@ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); // This is only needed for real time/allintra row-mt enabled multi-threaded // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. @@ -2276,15 +2279,17 @@ } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; - x->force_zeromv_skip_for_blk = - get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); + if (!seg_skip) { + x->force_zeromv_skip_for_blk = + get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); - // Source variance may be already compute at superblock level, so no need - // to recompute, unless bsize < sb_size or source_variance is not yet set. - if (!x->force_zeromv_skip_for_blk && - (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) - x->source_variance = av1_get_perpixel_variance_facade( - cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + // Source variance may already be computed at superblock level, so no need + // to recompute, unless bsize < sb_size or source_variance is not yet set. 
+ if (!x->force_zeromv_skip_for_blk && + (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + } // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; @@ -2305,27 +2310,27 @@ #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - RD_STATS invalid_rd; - av1_invalid_rd_stats(&invalid_rd); - // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip - av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, - rd_cost, bsize, ctx, - invalid_rd.rdcost); - } else { - av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx); + if (seg_skip) { + x->force_zeromv_skip_for_blk = 1; + // TODO(marpan): Consider adding a function for nonrd: + // av1_nonrd_pick_inter_mode_sb_seg_skip(), instead of setting + // x->force_zeromv_skip flag and entering av1_nonrd_pick_inter_mode_sb(). } + av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif } if (cpi->sf.rt_sf.skip_cdef_sb) { // cdef_strength is initialized to 1 which means skip_cdef, and is updated - // here. Check to see is skipping cdef is allowed. + // here. Check to see if skipping cdef is allowed. Never skip on slide/scene + // change, near a key frame, or when color sensitivity is set. Always allow + // cdef_skip for seg_skip = 1. 
+ const int allow_cdef_skipping = - cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && - !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || - x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]); + seg_skip || + (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && + !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])); // Find the corresponding 64x64 block. It'll be the 128x128 block if that's // the block size. @@ -2334,8 +2339,16 @@ MB_MODE_INFO **mi_sb = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); - // Do not skip if intra or new mv is picked, or color sensitivity is set. - // Never skip on slide/scene change. + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + unsigned int thresh_spatial_var = + (cpi->oxcf.speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) + ? 400 + : UINT_MAX; + // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or + // intra or new mv is picked, with possible condition on spatial variance. + // For skip_cdef_sb >= 2: more aggressive mode to always skip unless + // allow_cdef_skipping is false and source_variance is non-zero. if (cpi->sf.rt_sf.skip_cdef_sb >= 2) { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && @@ -2343,7 +2356,8 @@ } else { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && allow_cdef_skipping && - !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + !(x->source_variance < thresh_spatial_var && + (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV)); } // Store in the pickmode context. ctx->mic.cdef_strength = mi_sb[0]->cdef_strength; @@ -4233,6 +4247,54 @@ } } +// Returns true if either of the left and top neighbor blocks is larger than +// the current block; false otherwise. 
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( + const MACROBLOCKD *xd, BLOCK_SIZE bsize) { + const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); + if (xd->left_available) { + const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; + if (block_size_high[left_bsize] * block_size_wide[left_bsize] > + cur_blk_area) + return true; + } + + if (xd->up_available) { + const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize; + if (block_size_high[above_bsize] * block_size_wide[above_bsize] > + cur_blk_area) + return true; + } + return false; +} + +static AOM_INLINE void prune_rect_part_using_none_pred_mode( + const MACROBLOCKD *xd, PartitionSearchState *part_state, + PREDICTION_MODE mode, BLOCK_SIZE bsize) { + if (mode == DC_PRED || mode == SMOOTH_PRED) { + // If the prediction mode of NONE partition is either DC_PRED or + // SMOOTH_PRED, it indicates that the current block has less variation. In + // this case, HORZ and VERT partitions are pruned if at least one of left + // and top neighbor blocks is larger than the current block. + if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) { + part_state->prune_rect_part[HORZ] = 1; + part_state->prune_rect_part[VERT] = 1; + } + } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) { + // If the prediction mode chosen by NONE partition is close to 90 degrees, + // it implies a dominant vertical pattern, and the chance of choosing a + // vertical rectangular partition is high. Hence, horizontal partition is + // pruned in these cases. + part_state->prune_rect_part[HORZ] = 1; + } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) { + // If the prediction mode chosen by NONE partition is close to 180 degrees, + // it implies a dominant horizontal pattern, and the chance of choosing a + // horizontal rectangular partition is high. Hence, vertical partition is + // pruned in these cases. + part_state->prune_rect_part[VERT] = 1; + } +} + // PARTITION_NONE search. 
static void none_partition_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x, @@ -4322,6 +4384,10 @@ part_search_state, best_rdc, pb_source_variance); } + + if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode) + prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state, + pc_tree->none->mic.mode, bsize); } av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); }
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c index ce06313..1d62f12 100644 --- a/av1/encoder/partition_strategy.c +++ b/av1/encoder/partition_strategy.c
@@ -1761,7 +1761,7 @@ // Decide whether to evaluate the AB partition specified by part_type based on // split and HORZ/VERT info -int evaluate_ab_partition_based_on_split( +static int evaluate_ab_partition_based_on_split( const PC_TREE *pc_tree, PARTITION_TYPE rect_part, const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1, int split_idx2) {
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c index 68b1056..eca49c0 100644 --- a/av1/encoder/pass2_strategy.c +++ b/av1/encoder/pass2_strategy.c
@@ -18,8 +18,10 @@ /*! @} - end defgroup gf_group_algo */ #include <assert.h> +#include <limits.h> #include <stdint.h> +#include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" @@ -158,28 +160,12 @@ return (int)max_bits; } -static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, - 0.80, 0.85, 0.90, - 0.95, 0.95, 0.95 }; -#define ERR_DIVISOR 96.0 -static double calc_correction_factor(double err_per_mb, int q) { - const double error_term = err_per_mb / ERR_DIVISOR; - const int index = q >> 5; - // Adjustment to power term based on qindex - const double power_term = - q_pow_term[index] + - (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); - assert(error_term >= 0.0); - return fclamp(pow(error_term, power_term), 0.05, 5.0); -} - // Based on history adjust expectations of bits per macroblock. static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { TWO_PASS *const twopass = &cpi->ppi->twopass; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Based on recent history adjust expectations of bits per macroblock. - double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0); double rate_err_factor = 1.0; const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0); const double min_fac = 1.0 - adj_limit; @@ -214,9 +200,7 @@ } int err_estimate = p_rc->rate_error_estimate; - int64_t bits_left = twopass->bits_left; int64_t total_actual_bits = p_rc->total_actual_bits; - int64_t bits_off_target = p_rc->vbr_bits_off_target; double rolling_arf_group_actual_bits = (double)twopass->rolling_arf_group_actual_bits; double rolling_arf_group_target_bits = @@ -231,10 +215,6 @@ : 0; total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits : p_rc->total_actual_bits; - bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target - : p_rc->vbr_bits_off_target; - bits_left = - simulate_parallel_frame ? 
p_rc->temp_bits_left : twopass->bits_left; rolling_arf_group_target_bits = (double)(simulate_parallel_frame ? p_rc->temp_rolling_arf_group_target_bits @@ -247,21 +227,21 @@ : p_rc->rate_error_estimate; #endif - if (p_rc->bits_off_target && total_actual_bits > 0) { - if (cpi->ppi->lap_enabled) { - rate_err_factor = rolling_arf_group_actual_bits / - DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits); + if ((p_rc->bits_off_target && total_actual_bits > 0) && + (rolling_arf_group_target_bits >= 1.0)) { + if (rolling_arf_group_actual_bits > rolling_arf_group_target_bits) { + double error_fraction = + (rolling_arf_group_actual_bits - rolling_arf_group_target_bits) / + rolling_arf_group_target_bits; + error_fraction = (error_fraction > 1.0) ? 1.0 : error_fraction; + rate_err_factor = 1.0 + error_fraction; } else { - rate_err_factor = 1.0 - ((double)(bits_off_target) / - AOMMAX(total_actual_bits, bits_left)); + double error_fraction = + (rolling_arf_group_target_bits - rolling_arf_group_actual_bits) / + rolling_arf_group_target_bits; + rate_err_factor = 1.0 - error_fraction; } - // Adjustment is damped if this is 1 pass with look ahead processing - // (as there are only ever a few frames of data) and for all but the first - // GOP in normal two pass. 
- if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { - rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); - } rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); } @@ -270,36 +250,42 @@ if ((rate_err_factor < 1.0 && err_estimate >= 0) || (rate_err_factor > 1.0 && err_estimate <= 0)) { twopass->bpm_factor *= rate_err_factor; - if (rate_err_tol >= 100) { - twopass->bpm_factor = - AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); - } else { - twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor)); - } + twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); } } -static int qbpm_enumerator(int rate_err_tol) { - return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75); +static const double q_div_term[(QINDEX_RANGE >> 5) + 1] = { 32.0, 40.0, 46.0, + 52.0, 56.0, 60.0, + 64.0, 68.0, 72.0 }; +#define EPMB_SCALER 1250000 +static double calc_correction_factor(double err_per_mb, int q) { + double power_term = 0.90; + const int index = q >> 5; + const double divisor = + q_div_term[index] + + (((q_div_term[index + 1] - q_div_term[index]) * (q % 32)) / 32.0); + double error_term = EPMB_SCALER * pow(err_per_mb, power_term); + return error_term / divisor; } // Similar to find_qindex_by_rate() function in ratectrl.c, but includes // calculation of a correction_factor. 
-static int find_qindex_by_rate_with_correction( - int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, - double group_weight_factor, int rate_err_tol, int best_qindex, - int worst_qindex) { +static int find_qindex_by_rate_with_correction(uint64_t desired_bits_per_mb, + aom_bit_depth_t bit_depth, + double error_per_mb, + double group_weight_factor, + int best_qindex, + int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; - const double mid_factor = calc_correction_factor(error_per_mb, mid); + const double q_factor = calc_correction_factor(error_per_mb, mid); const double q = av1_convert_qindex_to_q(mid, bit_depth); - const int enumerator = qbpm_enumerator(rate_err_tol); - const int mid_bits_per_mb = - (int)((enumerator * mid_factor * group_weight_factor) / q); + const uint64_t mid_bits_per_mb = + (uint64_t)((q_factor * group_weight_factor) / q); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; @@ -348,8 +334,8 @@ : cpi->common.mi_params.MBs; const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone); - const int target_norm_bits_per_mb = - (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; + const uint64_t target_norm_bits_per_mb = + ((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); // Update bpm correction factor based on previous GOP rate error. @@ -359,8 +345,8 @@ // content at the given rate. 
int q = find_qindex_by_rate_with_correction( target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, - av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol, - rc->best_quality, rc->worst_quality); + av_err_per_mb, cpi->ppi->twopass.bpm_factor, rc->best_quality, + rc->worst_quality); // Restriction on active max q for constrained quality mode. if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); @@ -3417,14 +3403,16 @@ CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; - int section_target_bandwidth; + int64_t section_target_bandwidth; const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - current_frame->frame_number); if (cpi->ppi->lap_enabled) - section_target_bandwidth = (int)rc->avg_frame_bandwidth; - else - section_target_bandwidth = (int)(twopass->bits_left / frames_left); - return section_target_bandwidth; + section_target_bandwidth = rc->avg_frame_bandwidth; + else { + section_target_bandwidth = twopass->bits_left / frames_left; + section_target_bandwidth = AOMMIN(section_target_bandwidth, INT_MAX); + } + return (int)section_target_bandwidth; } static INLINE void set_twopass_params_based_on_fp_stats( @@ -3535,12 +3523,13 @@ } // Smooth-out the noise variance so it is more stable +// Returns 0 on success, -1 on memory allocation failure. 
// TODO(bohanli): Use a better low-pass filter than averaging -static void smooth_filter_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats) { +static int smooth_filter_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { int len = (int)(last_stats - first_stats); double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise)); - if (!smooth_noise) return; + if (!smooth_noise) return -1; for (int i = 0; i < len; i++) { double total_noise = 0; @@ -3565,11 +3554,13 @@ } aom_free(smooth_noise); + return 0; } // Estimate the noise variance of each frame from the first pass stats void av1_estimate_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info) { FIRSTPASS_STATS *this_stats, *next_stats; double C1, C2, C3, noise; for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { @@ -3655,7 +3646,10 @@ this_stats->noise_var = (first_stats + 2)->noise_var; } - smooth_filter_noise(first_stats, last_stats); + if (smooth_filter_noise(first_stats, last_stats) == -1) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffers in smooth_filter_noise()"); + } } // Estimate correlation coefficient of each frame with its previous frame. 
@@ -3822,7 +3816,8 @@ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, - twopass->stats_buf_ctx->stats_in_end); + twopass->stats_buf_ctx->stats_in_end, + cpi->common.error); av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames, @@ -3996,7 +3991,7 @@ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, - twopass->stats_buf_ctx->stats_in_end); + twopass->stats_buf_ctx->stats_in_end, cpi->common.error); av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); @@ -4234,7 +4229,7 @@ int maxq_adj_limit; minq_adj_limit = (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); - maxq_adj_limit = rc->worst_quality - rc->active_worst_quality; + maxq_adj_limit = (rc->worst_quality - rc->active_worst_quality); // Undershoot if ((rc_cfg->under_shoot_pct < 100) && @@ -4246,8 +4241,9 @@ if ((pct_error >= rc_cfg->under_shoot_pct) && (p_rc->rate_error_estimate > 0)) { twopass->extend_minq += 1; + twopass->extend_maxq -= 1; } - twopass->extend_maxq -= 1; + // Overshoot } else if ((rc_cfg->over_shoot_pct < 100) && (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) { @@ -4259,18 +4255,8 @@ if ((pct_error >= rc_cfg->over_shoot_pct) && (p_rc->rate_error_estimate < 0)) { twopass->extend_maxq += 1; + twopass->extend_minq -= 1; } - twopass->extend_minq -= 1; - } else { - // Adjustment for extreme local overshoot. - // Only applies when normal adjustment above is not used (e.g. - // when threshold is set to 100). - if (rc->projected_frame_size > (2 * rc->base_frame_target) && - rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) - ++twopass->extend_maxq; - // Unwind extreme overshoot adjustment. 
- else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits) - --twopass->extend_maxq; } twopass->extend_minq = clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit); @@ -4285,8 +4271,9 @@ if (rc->projected_frame_size < fast_extra_thresh) { p_rc->vbr_bits_off_target_fast += fast_extra_thresh - rc->projected_frame_size; - p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast, - (4 * rc->avg_frame_bandwidth)); + p_rc->vbr_bits_off_target_fast = + AOMMIN(p_rc->vbr_bits_off_target_fast, + (4 * (int64_t)rc->avg_frame_bandwidth)); } }
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h index ff1591c..5987a78 100644 --- a/av1/encoder/pass2_strategy.h +++ b/av1/encoder/pass2_strategy.h
@@ -137,7 +137,8 @@ void av1_mark_flashes(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats); void av1_estimate_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats); + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info); void av1_estimate_coeff(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats);
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c index 9084d3f..ce03571 100644 --- a/av1/encoder/picklpf.c +++ b/av1/encoder/picklpf.c
@@ -27,12 +27,25 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" +// AV1 loop filter applies to the whole frame according to mi_rows and mi_cols, +// which are calculated based on aligned width and aligned height, +// In addition, if super res is enabled, it copies the whole frame +// according to the aligned width and height (av1_superres_upscale()). +// So we need to copy the whole filtered region, instead of the cropped region. +// For example, input image size is: 160x90. +// Then src->y_crop_width = 160, src->y_crop_height = 90. +// The aligned frame size is: src->y_width = 160, src->y_height = 96. +// AV1 aligns frame size to a multiple of 8, if there is +// chroma subsampling, it is able to ensure the chroma is also +// an integer number of mi units. mi unit is 4x4, 8 = 4 * 2, and 2 luma mi +// units correspond to 1 chroma mi unit if there is subsampling. +// See: aom_realloc_frame_buffer() in yv12config.c. static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int plane) { switch (plane) { - case 0: aom_yv12_copy_y(src_bc, dst_bc); break; - case 1: aom_yv12_copy_u(src_bc, dst_bc); break; - case 2: aom_yv12_copy_v(src_bc, dst_bc); break; + case 0: aom_yv12_copy_y(src_bc, dst_bc, 0); break; + case 1: aom_yv12_copy_u(src_bc, dst_bc, 0); break; + case 2: aom_yv12_copy_v(src_bc, dst_bc, 0); break; default: assert(plane >= 0 && plane <= 2); break; } } @@ -244,6 +257,8 @@ inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } else if (cpi->sf.rt_sf.use_fast_fixed_part) { + inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: @@ -311,7 +326,7 @@ &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - 
cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer");
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 369529a..f604994 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c
@@ -1044,10 +1044,13 @@ #if CONFIG_AV1_HIGHBITDEPTH void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; int i, j, k, l; int32_t Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -1167,7 +1170,7 @@ if (abs_akj > max_abs_akj) max_abs_akj = abs_akj; } const int scale_threshold = 1 << 22; - const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5); + const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 6); const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7); const int scaler = scaler_c * scaler_A; @@ -1199,7 +1202,8 @@ // Fix vector b, update vector a static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, int32_t *a, int32_t *b) { + int64_t **Hc, int32_t *a, + const int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -1269,7 +1273,8 @@ // Fix vector a, update vector b static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, int32_t *a, int32_t *b) { + int64_t **Hc, const int32_t *a, + int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -1648,9 +1653,10 @@ // functions. Optimize intrinsics of HBD design similar to LBD (i.e., // pre-calculate d and s buffers and avoid most of the C operations). 
av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, - rsc->src_buffer, limits->h_start, limits->h_end, - limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params->bit_depth); + rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, + H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, @@ -2056,7 +2062,7 @@ &cpi->trial_frame_rst, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); @@ -2070,11 +2076,17 @@ // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; -#if HAVE_AVX2 || HAVE_NEON - // The buffers allocated below are used during Wiener filter processing of low - // bitdepth path. Hence, allocate the same when Wiener filter is enabled in - // low bitdepth path. - if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { +#if HAVE_AVX2 || HAVE_SVE + // The buffers allocated below are used during Wiener filter processing. + // Hence, allocate the same when Wiener filter is enabled. Make sure to + // allocate these buffers only for the SIMD extensions that make use of them + // (i.e. AVX2 for low bitdepth and SVE for low and high bitdepth). 
+#if HAVE_AVX2 + bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; +#elif HAVE_SVE + bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; +#endif + if (allocate_buffers) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg, @@ -2210,8 +2222,13 @@ best_luma_unit_size); } -#if HAVE_AVX || HAVE_NEON - if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { +#if HAVE_AVX2 || HAVE_SVE +#if HAVE_AVX2 + bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; +#elif HAVE_SVE + bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; +#endif + if (free_buffers) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; }
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c index 1f1ff81..8060a8b 100644 --- a/av1/encoder/ratectrl.c +++ b/av1/encoder/ratectrl.c
@@ -12,6 +12,7 @@ #include <assert.h> #include <limits.h> #include <math.h> +#include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -29,6 +30,7 @@ #include "av1/common/seg_common.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/random.h" @@ -438,6 +440,33 @@ rc->rtc_external_ratectrl = 0; rc->frame_level_fast_extra_bits = 0; rc->use_external_qp_one_pass = 0; + rc->percent_blocks_inactive = 0; +} + +static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level, + int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 || + cpi->svc.framedrop_mode == AOM_LAYER_DROP) { + return (buffer_level <= drop_mark); + } else { + // For SVC in the AOM_FULL_SUPERFRAME_DROP): the condition on + // buffer is checked on current and upper spatial layers. + for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + PRIMARY_RATE_CONTROL *lrc = &lc->p_rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark; + const int drop_mark_layer = + (int)(drop_thresh * lrc->optimal_buffer_level / 100); + if (lrc->buffer_level <= drop_mark_layer) return true; + } + } + return false; + } } int av1_rc_drop_frame(AV1_COMP *cpi) { @@ -463,18 +492,29 @@ rc->drop_count_consec >= rc->max_consec_drop)) { return 0; } else { - if (buffer_level < 0) { + SVC *svc = &cpi->svc; + // In the full_superframe framedrop mode for svc, if the previous spatial + // layer was dropped, drop the current spatial layer. 
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1] && + svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP) + return 1; + // -1 is passed here for drop_mark since we are checking if + // buffer goes below 0 (<= -1). + if (check_buffer_below_thresh(cpi, buffer_level, -1)) { // Always drop if buffer is below 0. rc->drop_count_consec++; return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. - int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * - p_rc->optimal_buffer_level / 100); - if ((buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * + p_rc->optimal_buffer_level / 100); + const bool buffer_below_thresh = + check_buffer_below_thresh(cpi, buffer_level, drop_mark); + if (!buffer_below_thresh && rc->decimation_factor > 0) { --rc->decimation_factor; - } else if (buffer_level <= drop_mark && rc->decimation_factor == 0) { + } else if (buffer_below_thresh && rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -548,18 +588,19 @@ // Apply some control/clamp to QP under certain conditions. // Delay the use of the clamping for svc until after num_temporal_layers, // to make they have been set for each temporal layer. + // Check for rc->q_1/2_frame > 0 in case they have not been set due to + // dropped frames. if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 && + rc->q_1_frame > 0 && rc->q_2_frame > 0 && (!cpi->ppi->use_svc || svc->current_superframe > (unsigned int)svc->number_temporal_layers) && !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) { // If in the previous two frames we have seen both overshoot and undershoot - // clamp Q between the two. 
Check for rc->q_1/2_frame > 0 in case they have - // not been set due to dropped frames. + // clamp Q between the two. if (rc->rc_1_frame * rc->rc_2_frame == -1 && - rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 && - rc->q_2_frame > 0 && !overshoot_buffer_low) { + rc->q_1_frame != rc->q_2_frame && !overshoot_buffer_low) { int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), AOMMAX(rc->q_1_frame, rc->q_2_frame)); // If the previous frame had overshoot and the current q needs to @@ -1681,41 +1722,39 @@ const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; - const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; int active_best_quality = *active_best; int active_worst_quality = *active_worst; #if CONFIG_FPMT_TEST - const int simulate_parallel_frame = - cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && - cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; - int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq - : cpi->ppi->twopass.extend_minq; - int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq - : cpi->ppi->twopass.extend_maxq; #endif // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. if (cpi->oxcf.rc_cfg.mode != AOM_Q) { +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + const int extend_minq = simulate_parallel_frame + ? p_rc->temp_extend_minq + : cpi->ppi->twopass.extend_minq; + const int extend_maxq = simulate_parallel_frame + ? 
p_rc->temp_extend_maxq + : cpi->ppi->twopass.extend_maxq; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || is_intrl_arf_boost || refresh_frame->alt_ref_frame))) { -#if CONFIG_FPMT_TEST active_best_quality -= extend_minq; active_worst_quality += (extend_maxq / 2); -#else - active_best_quality -= cpi->ppi->twopass.extend_minq / 4; - active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); -#endif } else { -#if CONFIG_FPMT_TEST active_best_quality -= extend_minq / 2; active_worst_quality += extend_maxq; -#else - active_best_quality -= cpi->ppi->twopass.extend_minq / 4; - active_worst_quality += cpi->ppi->twopass.extend_maxq; -#endif } +#else + (void)is_intrl_arf_boost; + active_best_quality -= cpi->ppi->twopass.extend_minq / 8; + active_worst_quality += cpi->ppi->twopass.extend_maxq / 4; +#endif } #ifndef STRICT_RC @@ -2393,6 +2432,10 @@ // otherwise the avg_source_sad can get too large and subsequent frames // may miss the scene/slide detection. 
if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0; + if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true; + } } int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, @@ -2508,16 +2551,17 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - int vbr_max_bits; const int MBs = av1_get_MBs(width, height); - rc->avg_frame_bandwidth = - (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate); - rc->min_frame_bandwidth = - (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100); + const double avg_frame_bandwidth = + round(oxcf->rc_cfg.target_bandwidth / cpi->framerate); + rc->avg_frame_bandwidth = (int)AOMMIN(avg_frame_bandwidth, INT_MAX); - rc->min_frame_bandwidth = - AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + int64_t vbr_min_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100; + vbr_min_bits = AOMMIN(vbr_min_bits, INT_MAX); + + rc->min_frame_bandwidth = AOMMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. // The baseline for this aligns with HW implementations that @@ -2526,11 +2570,12 @@ // a very high rate is given on the command line or the the rate cannnot // be acheived because of a user specificed max q (e.g. when the user // specifies lossless encode. 
- vbr_max_bits = - (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) / - 100); + int64_t vbr_max_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section / 100; + vbr_max_bits = AOMMIN(vbr_max_bits, INT_MAX); + rc->max_frame_bandwidth = - AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits); av1_rc_set_gf_interval_range(cpi, rc); } @@ -2550,6 +2595,8 @@ #else int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target; #endif + int64_t frame_target = *this_frame_target; + const int stats_count = cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count @@ -2558,13 +2605,13 @@ 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number)); assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100); if (frame_window > 0) { - const int max_delta = (int)AOMMIN( - abs((int)(vbr_bits_off_target / frame_window)), - ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100); + const int64_t max_delta = + AOMMIN(llabs((vbr_bits_off_target / frame_window)), + (frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100); // vbr_bits_off_target > 0 means we have extra bits to spend // vbr_bits_off_target < 0 we are currently overshooting - *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta; + frame_target += (vbr_bits_off_target >= 0) ? 
max_delta : -max_delta; } #if CONFIG_FPMT_TEST @@ -2581,32 +2628,35 @@ p_rc->vbr_bits_off_target_fast && #endif !rc->is_src_frame_alt_ref) { - int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target); - int fast_extra_bits; + int64_t one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, frame_target); + int64_t fast_extra_bits; #if CONFIG_FPMT_TEST - fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = AOMMIN(vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = - (int)AOMMIN(fast_extra_bits, - AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8)); + AOMMIN(fast_extra_bits, + AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8)); #else + fast_extra_bits = AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = - (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits); - fast_extra_bits = (int)AOMMIN( - fast_extra_bits, - AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); + AOMMIN(fast_extra_bits, + AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); #endif + fast_extra_bits = AOMMIN(fast_extra_bits, INT_MAX); if (fast_extra_bits > 0) { - // Update this_frame_target only if additional bits are available from + // Update frame_target only if additional bits are available from // local undershoot. - *this_frame_target += (int)fast_extra_bits; + frame_target += fast_extra_bits; } // Store the fast_extra_bits of the frame and reduce it from // vbr_bits_off_target_fast during postencode stage. - rc->frame_level_fast_extra_bits = fast_extra_bits; + rc->frame_level_fast_extra_bits = (int)fast_extra_bits; // Retaining the condition to udpate during postencode stage since // fast_extra_bits are calculated based on vbr_bits_off_target_fast. cpi->do_update_vbr_bits_off_target_fast = 1; } + + // Clamp the target for the frame to the maximum allowed for one frame. 
+ *this_frame_target = (int)AOMMIN(frame_target, INT_MAX); } void av1_set_target_rate(AV1_COMP *cpi, int width, int height) { @@ -2903,10 +2953,12 @@ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0; // Set the reference frame flags. ext_flags->ref_frame_flags ^= AOM_LAST_FLAG; - ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; - ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; - if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) - ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; + if (!cpi->sf.rt_sf.force_only_last_ref) { + ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) + ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; + } const int sh = 6; // Moving index slot for last: 0 - (sh - 1). if (frame_number > 1) last_idx = ((frame_number - 1) % sh); @@ -2947,6 +2999,24 @@ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7); } +static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols, + int mi_rows, int sbi_col, int sbi_row, int sh, + int num_4x4) { + int r = sbi_row << sh; + int c = sbi_col << sh; + const int row_max = AOMMIN(num_4x4, mi_rows - r); + const int col_max = AOMMIN(num_4x4, mi_cols - c); + // Active map is set for 16x16 blocks, so only need to + // check over16x16, + for (int x = 0; x < row_max; x += 4) { + for (int y = 0; y < col_max; y += 4) { + if (active_map_4x4[(r + x) * mi_cols + (c + y)] == AM_SEGMENT_ID_ACTIVE) + return 1; + } + } + return 0; +} + /*!\brief Check for scene detection, for 1 pass real-time mode. * * Compute average source sad (temporal sad: between current source and @@ -3011,7 +3081,7 @@ } int num_zero_temp_sad = 0; uint32_t min_thresh = 10000; - if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + if (cpi->sf.rt_sf.higher_thresh_scene_detection) { min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 
50000 : 100000; @@ -3049,11 +3119,26 @@ sizeof(*cpi->src_sad_blk_64x64))); } } + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const int mi_cols = mi_params->mi_cols; + const int mi_rows = mi_params->mi_rows; + int sh = (cm->seq_params->sb_size == BLOCK_128X128) ? 5 : 4; + int num_4x4 = (cm->seq_params->sb_size == BLOCK_128X128) ? 32 : 16; + unsigned char *const active_map_4x4 = cpi->active_map.map; // Avoid bottom and right border. for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) { for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { - tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); + int block_is_active = 1; + if (cpi->active_map.enabled && rc->percent_blocks_inactive > 0) { + block_is_active = set_block_is_active(active_map_4x4, mi_cols, mi_rows, + sbi_col, sbi_row, sh, num_4x4); + } + if (block_is_active) { + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + } else { + tmp_sad = 0; + } if (cpi->src_sad_blk_64x64 != NULL) cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad; if (check_light_change) { @@ -3381,6 +3466,7 @@ svc->layer_context[layer].is_key_frame = 1; } rc->frame_number_encoded = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; } else { *frame_type = INTER_FRAME; gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; @@ -3411,8 +3497,13 @@ } } } - // Check for scene change: for SVC check on base spatial layer only. 
- if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) { + if (cpi->active_map.enabled && cpi->rc.percent_blocks_inactive == 100) { + rc->frame_source_sad = 0; + rc->avg_source_sad = (3 * rc->avg_source_sad + rc->frame_source_sad) >> 2; + rc->percent_blocks_with_motion = 0; + rc->high_source_sad = 0; + } else if (cpi->sf.rt_sf.check_scene_detection && + svc->spatial_layer_id == 0) { if (rc->prev_coded_width == cm->width && rc->prev_coded_height == cm->height) { rc_scene_detection_onepass_rt(cpi, frame_input); @@ -3477,6 +3568,10 @@ } } +#define CHECK_INTER_LAYER_PRED(ref_frame) \ + ((cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) && \ + (av1_check_ref_is_low_spatial_res_super_frame(cpi, ref_frame))) + int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { AV1_COMMON *const cm = &cpi->common; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; @@ -3487,12 +3582,26 @@ int target_bits_per_mb; double q2; int enumerator; + int inter_layer_pred_on = 0; int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); - *q = (3 * cpi->rc.worst_quality + *q) >> 2; - // For screen content use the max-q set by the user to allow for less - // overshoot on slide changes. - if (is_screen_content) *q = cpi->rc.worst_quality; cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + if (cpi->svc.spatial_layer_id > 0) { + // For spatial layers: check if inter-layer (spatial) prediction is used + // (check if any reference is being used that is the lower spatial layer), + inter_layer_pred_on = CHECK_INTER_LAYER_PRED(LAST_FRAME) || + CHECK_INTER_LAYER_PRED(GOLDEN_FRAME) || + CHECK_INTER_LAYER_PRED(ALTREF_FRAME); + } + // If inter-layer prediction is on: we expect to pull up the quality from + // the lower spatial layer, so we can use a lower q. 
+ if (cpi->svc.spatial_layer_id > 0 && inter_layer_pred_on) { + *q = (cpi->rc.worst_quality + *q) >> 1; + } else { + *q = (3 * cpi->rc.worst_quality + *q) >> 2; + // For screen content use the max-q set by the user to allow for less + // overshoot on slide changes. + if (is_screen_content) *q = cpi->rc.worst_quality; + } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting @@ -3521,8 +3630,10 @@ rate_correction_factor; } // For temporal layers: reset the rate control parameters across all - // temporal layers. - if (cpi->svc.number_temporal_layers > 1) { + // temporal layers. Only do it for spatial enhancement layers when + // inter_layer_pred_on is not set (off). + if (cpi->svc.number_temporal_layers > 1 && + (cpi->svc.spatial_layer_id == 0 || inter_layer_pred_on == 0)) { SVC *svc = &cpi->svc; for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int sl = svc->spatial_layer_id;
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h index 6802ad4..5121a90 100644 --- a/av1/encoder/ratectrl.h +++ b/av1/encoder/ratectrl.h
@@ -249,6 +249,9 @@ // signals if number of blocks with motion is high int percent_blocks_with_motion; + // signals percentage of 16x16 blocks that are inactive, via active_maps + int percent_blocks_inactive; + // Maximum value of source sad across all blocks of frame. uint64_t max_block_source_sad;
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c index 83e5d4f..9b96411 100644 --- a/av1/encoder/reconinter_enc.c +++ b/av1/encoder/reconinter_enc.c
@@ -157,7 +157,7 @@ get_ref_scale_factors_const(ctxt->cm, frame); xd->block_ref_scale_factors[0] = sf; - if ((!av1_is_valid_scale(sf))) + if (!av1_is_valid_scale(sf)) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions");
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 830d2c6..893749c 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c
@@ -514,6 +514,7 @@ sf->part_sf.prune_rectangular_split_based_on_qidx = allow_screen_content_tools ? 0 : 2; sf->part_sf.prune_rect_part_using_4x4_var_deviation = true; + sf->part_sf.prune_rect_part_using_none_pred_mode = true; sf->part_sf.prune_sub_8x8_partition_level = allow_screen_content_tools ? 0 : 1; sf->part_sf.prune_part4_search = 3; @@ -1176,6 +1177,7 @@ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->gm_sf.prune_zero_mv_with_sse = 2; + sf->gm_sf.downsample_level = 1; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2; @@ -1236,8 +1238,6 @@ sf->fp_sf.reduce_mv_step_param = 4; - sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; - sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3; sf->part_sf.ext_partition_eval_thresh = @@ -1281,6 +1281,8 @@ sf->hl_sf.disable_extra_sc_testing = 1; sf->hl_sf.second_alt_ref_filtering = 0; + sf->gm_sf.downsample_level = 2; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; sf->inter_sf.selective_ref_frame = 6; sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2; @@ -1453,7 +1455,27 @@ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1; } - + // TODO(marpan): Tune settings for speed 11 video mode, + // for resolutions below 720p. 
+ if (speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + sf->rt_sf.skip_cdef_sb = 1; + sf->rt_sf.force_only_last_ref = 1; + sf->rt_sf.selective_cdf_update = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + if (is_360p_or_larger) { + sf->part_sf.fixed_partition_size = BLOCK_32X32; + sf->rt_sf.use_fast_fixed_part = 1; + sf->mv_sf.subpel_force_stop = HALF_PEL; + } + sf->rt_sf.increase_source_sad_thresh = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + for (int i = 0; i < BLOCK_SIZES; ++i) { + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + } + sf->rt_sf.hybrid_intra_pickmode = 0; + } // Setting for SVC, or when the ref_frame_config control is // used to set the reference structure. if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) { @@ -1553,15 +1575,25 @@ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80; sf->rt_sf.part_early_exit_zeromv = 1; sf->rt_sf.nonrd_aggressive_skip = 1; + sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; + sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.dct_only_palette_nonrd = 1; + sf->rt_sf.prune_palette_search_nonrd = 1; + sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; } if (speed >= 11) { sf->rt_sf.skip_lf_screen = 2; sf->rt_sf.skip_cdef_sb = 2; sf->rt_sf.part_early_exit_zeromv = 2; - sf->rt_sf.prune_palette_nonrd = 1; - sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + sf->rt_sf.prune_palette_search_nonrd = 2; sf->rt_sf.increase_color_thresh_palette = 0; + sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; + sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; } + sf->rt_sf.skip_encoding_non_reference_slide_change = 1; + sf->rt_sf.skip_newmv_flat_blocks_screen = 1; + sf->rt_sf.use_idtx_nonrd = 1; + sf->rt_sf.higher_thresh_scene_detection = 0; sf->rt_sf.use_nonrd_altref_frame = 0; sf->rt_sf.use_rtc_tf = 0; sf->rt_sf.use_comp_ref_nonrd = 0; @@ -1577,15 +1609,22 @@ } } if (cpi->rc.max_block_source_sad > 
20000 && - cpi->rc.frame_source_sad > 100 && - cpi->rc.percent_blocks_with_motion > 1 && speed >= 6) { + cpi->rc.frame_source_sad > 100 && speed >= 6 && + (cpi->rc.percent_blocks_with_motion > 1 || + cpi->svc.last_layer_dropped[0])) { sf->mv_sf.search_method = NSTEP; sf->rt_sf.fullpel_search_step_param = 2; } + if (cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { + sf->rt_sf.use_idtx_nonrd = 0; + sf->rt_sf.prefer_large_partition_blocks = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->rt_sf.fullpel_search_step_param = 10; + } sf->rt_sf.partition_direct_merging = 0; sf->hl_sf.accurate_bit_estimate = 0; - // This feature is for nonrd_pickmode and non-svc for now. - if (sf->rt_sf.use_nonrd_pick_mode && !cpi->ppi->use_svc) + // This feature is for nonrd_pickmode. + if (sf->rt_sf.use_nonrd_pick_mode) sf->rt_sf.estimate_motion_for_var_based_partition = 1; else sf->rt_sf.estimate_motion_for_var_based_partition = 0; @@ -1600,6 +1639,18 @@ // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614. sf->rt_sf.estimate_motion_for_var_based_partition = 0; } + if (cpi->oxcf.superres_cfg.enable_superres) { + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + } + // rtc_tf feature allocates new source because of possible + // temporal filtering which may change the input source during encoding: + // this causes an issue on resized frames when psnr is calculated, + // so disable it here for frames that are resized (encoding width/height + // different from configured width/height). 
+ if (is_psnr_calc_enabled(cpi) && (cpi->oxcf.frm_dim_cfg.width != cm->width || + cpi->oxcf.frm_dim_cfg.height != cm->height)) + sf->rt_sf.use_rtc_tf = 0; } // TODO(kyslov): now this is very similar to @@ -1768,6 +1819,8 @@ FLAG_EARLY_TERMINATE; sf->rt_sf.var_part_split_threshold_shift = 5; if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1; + sf->rt_sf.use_fast_fixed_part = 0; + sf->rt_sf.increase_source_sad_thresh = 0; if (speed >= 6) { sf->mv_sf.use_fullpel_costlist = 1; @@ -1940,6 +1993,7 @@ gm_sf->prune_ref_frame_for_gm_search = 0; gm_sf->prune_zero_mv_with_sse = 0; gm_sf->disable_gm_search_based_on_stats = 0; + gm_sf->downsample_level = 0; gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; } @@ -1978,6 +2032,7 @@ part_sf->prune_ext_part_using_split_info = 0; part_sf->prune_rectangular_split_based_on_qidx = 0; part_sf->prune_rect_part_using_4x4_var_deviation = false; + part_sf->prune_rect_part_using_none_pred_mode = false; part_sf->early_term_after_none_split = 0; part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; @@ -2205,6 +2260,7 @@ rt_sf->use_nonrd_filter_search = 0; rt_sf->use_simple_rd_model = 0; rt_sf->hybrid_intra_pickmode = 0; + rt_sf->prune_palette_search_nonrd = 0; rt_sf->source_metrics_sb_nonrd = 0; rt_sf->overshoot_detection_cbr = NO_DETECTION; rt_sf->check_scene_detection = 0; @@ -2229,12 +2285,13 @@ rt_sf->var_part_split_threshold_shift = 7; rt_sf->gf_refresh_based_on_qp = 0; rt_sf->use_rtc_tf = 0; + rt_sf->use_idtx_nonrd = 0; rt_sf->prune_idtx_nonrd = 0; - rt_sf->prune_palette_nonrd = 0; rt_sf->dct_only_palette_nonrd = 0; rt_sf->part_early_exit_zeromv = 0; rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; rt_sf->skip_lf_screen = 0; + rt_sf->thresh_active_maps_skip_lf_cdef = 100; rt_sf->sad_based_adp_altref_lag = 0; rt_sf->partition_direct_merging = 0; rt_sf->var_part_based_on_qidx = 0; @@ -2255,6 +2312,11 @@ rt_sf->enable_ref_short_signaling = false; 
rt_sf->check_globalmv_on_single_ref = true; rt_sf->increase_color_thresh_palette = false; + rt_sf->selective_cdf_update = 0; + rt_sf->force_only_last_ref = 0; + rt_sf->higher_thresh_scene_detection = 1; + rt_sf->skip_newmv_flat_blocks_screen = 0; + rt_sf->skip_encoding_non_reference_slide_change = 0; } static fractional_mv_step_fp
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index 14cd874..c768ff3 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h
@@ -587,6 +587,9 @@ // GF group int disable_gm_search_based_on_stats; + // Downsampling pyramid level to use for global motion estimation + int downsample_level; + // Number of refinement steps to apply after initial model generation int num_refinement_steps; } GLOBAL_MOTION_SPEED_FEATURES; @@ -720,6 +723,28 @@ // speed feature is not applicable to speed >= 7. bool prune_rect_part_using_4x4_var_deviation; + // Prune rectangular partitions based on prediction mode chosen by NONE + // partition. + // false : no pruning + // true : prunes rectangular partition as described below + // If prediction mode chosen by NONE partition is + // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if + // at least one of the left and top neighbor blocks is larger than the + // current block. + // Directional Mode: Prunes either of the horizontal and vertical partition + // based on center angle of the prediction mode chosen by NONE partition. For + // example, vertical partition is pruned if center angle of the prediction + // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal + // direction) and vice versa. + // For allintra encode, this speed feature reduces instruction count by 5.1% + // for speed=6 with coding performance change less than 0.22%. For AVIF image + // encode, this speed feature reduces encode time by 4.44% for speed 6 on a + // typical image dataset with coding performance change less than 0.15%. + // For speed >= 7, variance-based logic is used to determine the partition + // structure instead of recursive partition search. Therefore, this speed + // feature is not applicable in such cases. + bool prune_rect_part_using_none_pred_mode; + // Terminate partition search for child partition, // when NONE and SPLIT partition rd_costs are INT64_MAX. 
int early_term_after_none_split; @@ -1604,6 +1629,15 @@ // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise int hybrid_intra_pickmode; + // Filter blocks by certain criteria such as SAD, source variance, such that + // fewer blocks will go through the palette search. + // For nonrd encoding path, enabling this feature reduces encoding time when + // palette mode is used. Disabling it leads to better compression efficiency. + // 0: off + // 1: less aggressive pruning mode + // 2: more aggressive pruning mode + int prune_palette_search_nonrd; + // Compute variance/sse on source difference, prior to encoding superblock. int source_metrics_sb_nonrd; @@ -1654,10 +1688,24 @@ // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off. int skip_cdef_sb; + // Force selective cdf update. + int selective_cdf_update; + + // Force only single reference (LAST) for prediction. + int force_only_last_ref; + // Forces larger partition blocks in variance based partitioning for intra // frames int force_large_partition_blocks_intra; + // Use fixed partition for superblocks based on source_sad. + // 0: disabled + // 1: enabled + int use_fast_fixed_part; + + // Increase source_sad thresholds in nonrd pickmode. + int increase_source_sad_thresh; + // Skip evaluation of no split in tx size selection for merge partition int skip_tx_no_split_var_based_partition; @@ -1715,14 +1763,13 @@ // Must be off for lossless mode. int use_rtc_tf; - // Prune the use of the identity transform in nonrd_pickmode, - // used for screen content mode: only for smaller blocks - // and higher spatial variance, and when skip_txfm is not - // already set. - int prune_idtx_nonrd; + // Use of the identity transform in nonrd_pickmode. + int use_idtx_nonrd; - // Prune the use of paletter mode in nonrd pickmode.
- int prune_palette_nonrd; + // Prune the use of the identity transform in nonrd_pickmode: + // only for smaller blocks and higher spatial variance, and when skip_txfm + // is not already set. + int prune_idtx_nonrd; // Force to only use dct for palette search in nonrd pickmode. int dct_only_palette_nonrd; @@ -1735,6 +1782,10 @@ // where rc->high_source_sad = 0 (no slide-changes). int skip_lf_screen; + // Threshold on the active/inactive region percent to disable + // the loopfilter and cdef. Setting to 100 disables this feature. + int thresh_active_maps_skip_lf_cdef; + // For nonrd: early exit out of variance partition that sets the // block size to superblock size, and sets mode to zeromv-last skip. // 0: disabled @@ -1859,6 +1910,15 @@ // This generally leads to better coding efficiency but with some speed loss. // Only used for screen content and for nonrd_pickmode. bool increase_color_thresh_palette; + + // Flag to indicate selecting of higher threshold for scene change detection. + int higher_thresh_scene_detection; + + // Flag to indicate skip testing of NEWMV for flat blocks. + int skip_newmv_flat_blocks_screen; + + // Flag to force skip encoding for non_reference_frame on slide/scene changes. + int skip_encoding_non_reference_slide_change; } REAL_TIME_SPEED_FEATURES; /*!\endcond */
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c index 5e1e289..41225d5 100644 --- a/av1/encoder/superres_scale.c +++ b/av1/encoder/superres_scale.c
@@ -347,7 +347,8 @@ SCALE_NUMERATOR }; int resize_denom = SCALE_NUMERATOR; if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && - cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { + (cpi->common.width != cpi->oxcf.frm_dim_cfg.width || + cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) { rsz.resize_width = cpi->common.width; rsz.resize_height = cpi->common.height; return rsz; @@ -403,7 +404,7 @@ assert(!is_lossless_requested(&cpi->oxcf.rc_cfg)); assert(!cm->features.all_lossless); - av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels); + av1_superres_upscale(cm, NULL, cpi->alloc_pyramid); // If regular resizing is occurring the source will need to be downscaled to // match the upscaled superres resolution. Otherwise the original source is
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c index ae0c276..dbab1d5 100644 --- a/av1/encoder/svc_layercontext.c +++ b/av1/encoder/svc_layercontext.c
@@ -77,6 +77,8 @@ } svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; + svc->last_layer_dropped[sl] = false; + svc->drop_spatial_layer[sl] = false; } if (svc->number_spatial_layers == 3) { svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH; @@ -195,14 +197,20 @@ const double prev_layer_framerate = cpi->framerate / lcprev->framerate_factor; const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; - lc->avg_frame_size = - (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / - (lc->framerate - prev_layer_framerate)); + if (lc->framerate > prev_layer_framerate) { + lc->avg_frame_size = + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } else { + lc->avg_frame_size = (int)round(lc->target_bandwidth / lc->framerate); + } } } -static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame( - int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) { +bool av1_check_ref_is_low_spatial_res_super_frame(AV1_COMP *const cpi, + int ref_frame) { + SVC *svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1]; return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe && rtc_ref->buffer_spatial_layer[ref_frame_idx] <= @@ -251,13 +259,13 @@ // previous spatial layer(s) at the same time (current_superframe). 
if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref && cpi->sf.rt_sf.use_nonrd_pick_mode) { - if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, LAST_FRAME)) { svc->skip_mvsearch_last = 1; } - if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, GOLDEN_FRAME)) { svc->skip_mvsearch_gf = 1; } - if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, ALTREF_FRAME)) { svc->skip_mvsearch_altref = 1; } } @@ -320,8 +328,12 @@ svc->temporal_layer_fb[i] = svc->temporal_layer_id; } } - if (svc->spatial_layer_id == svc->number_spatial_layers - 1) + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { svc->current_superframe++; + // Reset drop flag to false for next superframe. + for (int sl = 0; sl < svc->number_spatial_layers; sl++) + svc->drop_spatial_layer[sl] = false; + } } int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) { @@ -386,6 +398,11 @@ int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; + if (den == 1 && num == 1) { + *width_out = width_org; + *height_out = height_org; + return; + } w = width_org * num / den; h = height_org * num / den; // Make height and width even. 
@@ -397,6 +414,7 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; + AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = NULL; int width = 0, height = 0; lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + @@ -418,13 +436,13 @@ if (width * height <= 320 * 240) svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; - cpi->common.width = width; - cpi->common.height = height; + cm->width = width; + cm->height = height; alloc_mb_mode_info_buffers(cpi); av1_update_frame_size(cpi); if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { - svc->mi_cols_full_resoln = cpi->common.mi_params.mi_cols; - svc->mi_rows_full_resoln = cpi->common.mi_params.mi_rows; + svc->mi_cols_full_resoln = cm->mi_params.mi_cols; + svc->mi_rows_full_resoln = cm->mi_params.mi_rows; } }
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h index bfde33d..d56ea77 100644 --- a/av1/encoder/svc_layercontext.h +++ b/av1/encoder/svc_layercontext.h
@@ -147,6 +147,23 @@ * different/lower bitrate. */ int has_lower_quality_layer; + + /*! + * Flag to indicate the frame drop mode for SVC: one of the two settings: + * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP. + */ + AOM_SVC_FRAME_DROP_MODE framedrop_mode; + + /*! + * Flag to indicate if frame was dropped for a given spatial_layer_id on + * previous superframe. + */ + bool last_layer_dropped[AOM_MAX_SS_LAYERS]; + + /*! + * Flags whether a previous spatial layer was dropped in the same superframe. + */ + bool drop_spatial_layer[AOM_MAX_SS_LAYERS]; } SVC; struct AV1_COMP; @@ -206,6 +223,21 @@ */ void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); +/*!\brief Check if reference is a lower spatial layer at the same + * timestamp/superframe. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] ref_frame Reference frame + * + * \return True if the ref_frame is a lower spatial layer, otherwise false. + */ +bool av1_check_ref_is_low_spatial_res_super_frame(struct AV1_COMP *const cpi, + int ref_frame); + /*!\brief Prior to encoding the frame, set the layer context, for the current layer to be encoded, to the cpi struct. *
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index d6ae667..e8cc145 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c
@@ -463,12 +463,12 @@ // Returns: // Nothing will be returned. But the content to which `accum` and `pred` // point will be modified. -void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, - const MACROBLOCKD *mbd, - const BLOCK_SIZE block_size, - const int mb_row, const int mb_col, - const int num_planes, uint32_t *accum, - uint16_t *count) { +static void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, + const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, + const int mb_row, const int mb_col, + const int num_planes, uint32_t *accum, + uint16_t *count) { // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; @@ -564,9 +564,10 @@ // Returns: // Nothing will be returned. But the content to which `luma_sse_sum` points // will be modified. -void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum, - int block_height, int block_width, - int ss_x_shift, int ss_y_shift) { +static void compute_luma_sq_error_sum(uint32_t *square_diff, + uint32_t *luma_sse_sum, int block_height, + int block_width, int ss_x_shift, + int ss_y_shift) { for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { @@ -1443,26 +1444,24 @@ return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; } -void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf); - if (tf_info->is_temporal_filter_on == 0) return; + if (tf_info->is_temporal_filter_on == 0) return true; const AV1_COMMON *cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; - int ret; for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { - ret = aom_realloc_frame_buffer( - &tf_info->tf_buf[i], 
oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, - seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, NULL, NULL, NULL, - cpi->image_pyramid_levels, 0); - if (ret) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate tf_info"); + if (aom_realloc_frame_buffer( + &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->alloc_pyramid, 0)) { + return false; } } + return true; } void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h index 0b00c88..a40fb03 100644 --- a/av1/encoder/temporal_filter.h +++ b/av1/encoder/temporal_filter.h
@@ -14,6 +14,8 @@ #include <stdbool.h> +#include "aom_util/aom_pthread.h" + #ifdef __cplusplus extern "C" { #endif @@ -204,8 +206,10 @@ /*!\brief Allocate buffers for TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop * \param[in,out] cpi Top level encoder instance structure + * + * \return True on success, false on memory allocation failure. */ -void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const struct AV1_COMP *cpi); /*!\brief Free buffers for TEMPORAL_FILTER_INFO
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c index ca60e49..86f5485 100644 --- a/av1/encoder/tpl_model.c +++ b/av1/encoder/tpl_model.c
@@ -19,6 +19,7 @@ #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" +#include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/enums.h" @@ -193,7 +194,7 @@ &tpl_data->tpl_rec_pool[frame], width, height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, tpl_data->border_in_pixels, - byte_alignment, 0, alloc_y_plane_only)) + byte_alignment, false, alloc_y_plane_only)) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); }
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h index bcd5821..0150c70 100644 --- a/av1/encoder/tpl_model.h +++ b/av1/encoder/tpl_model.h
@@ -30,6 +30,7 @@ #include "config/aom_config.h" #include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/mv.h" #include "av1/common/scale.h"
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c index 92fc4b2..4381af6 100644 --- a/av1/encoder/tune_butteraugli.c +++ b/av1/encoder/tune_butteraugli.c
@@ -209,7 +209,7 @@ if (dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); } av1_copy_and_extend_frame(cpi->source, dst); @@ -218,7 +218,7 @@ aom_alloc_frame_buffer( resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); } if (!av1_resize_and_extend_frame_nonnormative( cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) { @@ -244,7 +244,7 @@ aom_alloc_frame_buffer( &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, height / resize_factor); @@ -267,12 +267,12 @@ cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, - 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } av1_setup_butteraugli_source(cpi);
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c index 4e5ffa3..fdb7c77 100644 --- a/av1/encoder/tune_vmaf.c +++ b/av1/encoder/tune_vmaf.c
@@ -247,7 +247,9 @@ // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. -DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, +// The array is of size 9 to allow passing gauss_filter + 1 to +// _mm_loadu_si128() in prepare_coeffs_6t(). +DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, @@ -288,10 +290,10 @@ } } -static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, - double source_variance, - YV12_BUFFER_CONFIG *const source, - YV12_BUFFER_CONFIG *const sharpened) { +static AOM_INLINE double cal_approx_vmaf( + const AV1_COMP *const cpi, double source_variance, + const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const sharpened) { const int bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; @@ -305,11 +307,11 @@ } static double find_best_frame_unsharp_amount_loop( - const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, - YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, - double best_vmaf, const double baseline_variance, - const double unsharp_amount_start, const double step_size, - const int max_loop_count, const double max_amount) { + const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const blurred, + const YV12_BUFFER_CONFIG *const sharpened, double best_vmaf, + const double baseline_variance, const double unsharp_amount_start, + const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_vmaf = best_vmaf; @@ -328,13 +330,11 @@ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); } -static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source, - 
YV12_BUFFER_CONFIG *const blurred, - const double unsharp_amount_start, - const double step_size, - const int max_loop_count, - const double max_filter_amount) { +static double find_best_frame_unsharp_amount( + const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const blurred, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; @@ -343,7 +343,7 @@ aom_alloc_frame_buffer( &sharpened, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); const double baseline_variance = frame_average_variance(cpi, source); double unsharp_amount; @@ -376,7 +376,7 @@ } void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; @@ -395,7 +395,7 @@ aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, source, &blurred); unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); @@ -403,7 +403,7 @@ } void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; @@ -415,11 +415,11 @@ aom_alloc_frame_buffer( &source_extended, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, - 
cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); @@ -442,7 +442,7 @@ } void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; @@ -455,11 +455,11 @@ memset(&source_extended, 0, sizeof(source_extended)); aom_alloc_frame_buffer( &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); @@ -495,11 +495,11 @@ aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { @@ -622,7 +622,7 @@ aom_alloc_frame_buffer( &resized_source, y_width / resize_factor, y_height / 
resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); if (!av1_resize_and_extend_frame_nonnormative( cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, @@ -643,7 +643,7 @@ aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, &resized_source, &blurred); YV12_BUFFER_CONFIG recon; @@ -651,7 +651,7 @@ aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_yv12_copy_frame(&resized_source, &recon, 1); VmafContext *vmaf_context; @@ -830,15 +830,15 @@ aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, cur, &blurred_cur); gaussian_blur(bit_depth, last, &blurred_last); @@ -881,8 +881,8 @@ } static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, - YV12_BUFFER_CONFIG **last, - YV12_BUFFER_CONFIG **next) { + const YV12_BUFFER_CONFIG **last, + const YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP 
*gf_group = &cpi->ppi->gf_group; const int src_index = @@ -920,7 +920,7 @@ if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { return current_qindex; } - YV12_BUFFER_CONFIG *cur_buf = cpi->source; + const YV12_BUFFER_CONFIG *cur_buf = cpi->source; if (cm->show_frame == 0) { const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *cur_entry = av1_lookahead_peek( @@ -929,7 +929,7 @@ } assert(cur_buf); - YV12_BUFFER_CONFIG *next_buf, *last_buf; + const YV12_BUFFER_CONFIG *next_buf, *last_buf; get_neighbor_frames(cpi, &last_buf, &next_buf); assert(last_buf); @@ -954,8 +954,8 @@ static AOM_INLINE double cal_approx_score( AV1_COMP *const cpi, double src_variance, double new_variance, - double src_score, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon_sharpened) { + double src_score, const YV12_BUFFER_CONFIG *const src, + const YV12_BUFFER_CONFIG *const recon_sharpened) { double score; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = @@ -967,11 +967,12 @@ static double find_best_frame_unsharp_amount_loop_neg( AV1_COMP *const cpi, double src_variance, double base_score, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, - YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred, - YV12_BUFFER_CONFIG *const recon_blurred, - YV12_BUFFER_CONFIG *const src_sharpened, - YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, + const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon, + const YV12_BUFFER_CONFIG *const ref, + const YV12_BUFFER_CONFIG *const src_blurred, + const YV12_BUFFER_CONFIG *const recon_blurred, + const YV12_BUFFER_CONFIG *const src_sharpened, + const YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, double best_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; @@ -999,8 +1000,8 @@ } static double 
find_best_frame_unsharp_amount_neg( - AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const src, + const YV12_BUFFER_CONFIG *const recon, const YV12_BUFFER_CONFIG *const ref, double base_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_filter_amount) { @@ -1023,18 +1024,18 @@ aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, recon, &recon_blurred); gaussian_blur(bit_depth, src, &src_blurred); @@ -1076,8 +1077,8 @@ } void av1_update_vmaf_curve(AV1_COMP *cpi) { - YV12_BUFFER_CONFIG *source = cpi->source; - YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + const YV12_BUFFER_CONFIG *source = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; const int bit_depth = cpi->td.mb.e_mbd.bd; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = @@ -1099,7 +1100,7 @@ } if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { - YV12_BUFFER_CONFIG *last, *next; + const YV12_BUFFER_CONFIG *last, *next; 
get_neighbor_frames(cpi, &last, &next); double best_unsharp_amount_start = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
diff --git a/av1/encoder/tune_vmaf.h b/av1/encoder/tune_vmaf.h index a04a29e..404fd10 100644 --- a/av1/encoder/tune_vmaf.h +++ b/av1/encoder/tune_vmaf.h
@@ -43,13 +43,13 @@ struct AV1_COMP; void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi);
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c index 7292c01..5dcc08c 100644 --- a/av1/encoder/tx_search.c +++ b/av1/encoder/tx_search.c
@@ -1109,13 +1109,11 @@ *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); } -uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, - int16_t allowed_tx_mask, int prune_factor, - const TXB_CTX *const txb_ctx, - int reduced_tx_set_used, int64_t ref_best_rd, - int num_sel) { +static uint16_t prune_txk_type_separ( + const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, int num_sel) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; @@ -1255,11 +1253,12 @@ return prune; } -uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, int *txk_map, - uint16_t allowed_tx_mask, int prune_factor, - const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { +static uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + int *txk_map, uint16_t allowed_tx_mask, + int prune_factor, const TXB_CTX *const txb_ctx, + int reduced_tx_set_used) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int tx_type;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c index 5505db2..2c9772d 100644 --- a/av1/encoder/var_based_part.c +++ b/av1/encoder/var_based_part.c
@@ -27,6 +27,7 @@ #include "av1/common/blockd.h" #include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/var_based_part.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/rdopt_utils.h" @@ -1109,8 +1110,8 @@ static void fill_variance_tree_leaves( AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split, int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4], - int *variance4x4downsample, int64_t *thresholds, const uint8_t *src_buf, - int src_stride, const uint8_t *dst_buf, int dst_stride, bool is_key_frame, + int64_t *thresholds, const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, bool is_key_frame, const bool is_small_sb) { MACROBLOCKD *xd = &x->e_mbd; const int num_64x64_blocks = is_small_sb ? 1 : 4; @@ -1157,11 +1158,8 @@ const int split_index = 21 + lvl1_scale_idx + lvl2_idx; VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; force_split[split_index] = PART_EVAL_ALL; - variance4x4downsample[lvl1_scale_idx + lvl2_idx] = 0; if (is_key_frame) { - force_split[split_index] = PART_EVAL_ALL; // Go down to 4x4 down-sampling for variance. - variance4x4downsample[lvl1_scale_idx + lvl2_idx] = 1; for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3); @@ -1347,6 +1345,8 @@ AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); + bool scaled_ref_golden = false; + bool scaled_ref_alt = false; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; MB_MODE_INFO *mi = xd->mi[0]; const YV12_BUFFER_CONFIG *yv12 = @@ -1364,21 +1364,22 @@ cpi->sf.rt_sf.use_nonrd_altref_frame || (cpi->sf.rt_sf.use_comp_ref_nonrd && cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1); - // On a resized frame (reference has different scale) only use - // LAST as reference for partitioning for now. 
- if (scaled_ref_last) { - use_golden_ref = 0; - use_alt_ref = 0; - } // For 1 spatial layer: GOLDEN is another temporal reference. // Check if it should be used as reference for partitioning. if (cpi->svc.number_spatial_layers == 1 && use_golden_ref && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && (yv12_g->y_crop_height != cm->height || + yv12_g->y_crop_width != cm->width)) { + yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + scaled_ref_golden = true; + } if (yv12_g && yv12_g != yv12) { - av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, - get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + av1_setup_pre_planes( + xd, 0, yv12_g, mi_row, mi_col, + scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME), + num_planes); *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, @@ -1392,9 +1393,16 @@ (cpi->ref_frame_flags & AOM_ALT_FLAG) && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + if (yv12_alt && (yv12_alt->y_crop_height != cm->height || + yv12_alt->y_crop_width != cm->width)) { + yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME); + scaled_ref_alt = true; + } if (yv12_alt && yv12_alt != yv12) { - av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col, - get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); + av1_setup_pre_planes( + xd, 0, yv12_alt, mi_row, mi_col, + scaled_ref_alt ? 
NULL : get_ref_scale_factors(cm, ALTREF_FRAME), + num_planes); *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, @@ -1518,7 +1526,9 @@ int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) { if (set_zeromv_skip_based_on_source_sad == 0) return false; - if (set_zeromv_skip_based_on_source_sad >= 2) + if (set_zeromv_skip_based_on_source_sad >= 3) + return source_sad_nonrd <= kLowSad; + else if (set_zeromv_skip_based_on_source_sad >= 2) return source_sad_nonrd <= kVeryLowSad; else if (set_zeromv_skip_based_on_source_sad >= 1) return source_sad_nonrd == kZeroSad; @@ -1527,20 +1537,21 @@ } static AOM_INLINE bool set_force_zeromv_skip_for_sb( - AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP16x16 *vt2, - VP128x128 *vt, unsigned int *uv_sad, int mi_row, int mi_col, - unsigned int y_sad, BLOCK_SIZE bsize) { + AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt, + unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad, + BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; if (!is_set_force_zeromv_skip_based_on_src_sad( cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad, x->content_state_sb.source_sad_nonrd)) return false; + int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 1 : 0; const int block_width = mi_size_wide[cm->seq_params->sb_size]; const int block_height = mi_size_high[cm->seq_params->sb_size]; const unsigned int thresh_exit_part_y = - cpi->zeromv_skip_thresh_exit_part[bsize]; + cpi->zeromv_skip_thresh_exit_part[bsize] << shift; unsigned int thresh_exit_part_uv = - CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y); + CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift; // Be more aggressive in UV threshold if source_sad >= VeryLowSad // to suppreess visual artifact caused by the speed feature: // set_zeromv_skip_based_on_source_sad = 2. 
For now only for @@ -1553,7 +1564,6 @@ uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) { set_block_size(cpi, mi_row, mi_col, bsize); x->force_zeromv_skip_for_sb = 1; - aom_free(vt2); aom_free(vt); // Partition shape is set here at SB level. // Exit needs to happen from av1_choose_var_based_partitioning(). @@ -1573,8 +1583,6 @@ AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; - VP128x128 *vt; - VP16x16 *vt2 = NULL; PART_EVAL_STATUS force_split[85]; int avg_64x64; int max_var_32x32[4]; @@ -1586,7 +1594,6 @@ int avg_16x16[4][4]; int maxvar_16x16[4][4]; int minvar_16x16[4][4]; - int64_t threshold_4x4avg; const uint8_t *src_buf; const uint8_t *dst_buf; int dst_stride; @@ -1614,19 +1621,24 @@ unsigned int y_sad_last = UINT_MAX; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + // Force skip encoding for all superblocks on slide change for + // non_reference_frames. + if (cpi->sf.rt_sf.skip_encoding_non_reference_slide_change && + cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + av1_set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); + x->force_zeromv_skip_for_sb = 1; + return 0; + } + // Ref frame used in partitioning. 
MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; - AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); - - vt->split = td->vt64x64; - int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1], vbp_thresholds[2], vbp_thresholds[3], vbp_thresholds[4] }; - const int low_res = (cm->width <= 352 && cm->height <= 288); - int variance4x4downsample[64]; const int segment_id = xd->mi[0]->segment_id; uint64_t blk_sad = 0; if (cpi->src_sad_blk_64x64 != NULL && @@ -1653,9 +1665,6 @@ x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd, is_segment_id_boosted, x->content_state_sb.lighting_change); - // For non keyframes, disable 4x4 average for low resolution when speed = 8 - threshold_4x4avg = INT64_MAX; - src_buf = x->plane[AOM_PLANE_Y].src.buf; int src_stride = x->plane[AOM_PLANE_Y].src.stride; @@ -1720,6 +1729,10 @@ x->force_zeromv_skip_for_sb = 0; + VP128x128 *vt; + AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); + vt->split = td->vt64x64; + // If the superblock is completely static (zero source sad) and // the y_sad (relative to LAST ref) is very small, take the sb_size partition // and exit, and force zeromv_last skip mode for nonrd_pickmode. @@ -1730,28 +1743,19 @@ cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE && ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) { // Exit here, if zero mv skip flag is set at SB level. 
- if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt2, vt, uv_sad, mi_row, - mi_col, y_sad, bsize)) + if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col, + y_sad, bsize)) return 0; } if (cpi->noise_estimate.enabled) noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); - if (low_res && threshold_4x4avg < INT64_MAX) { - vt2 = aom_malloc(sizeof(*vt2)); - if (!vt2) { - aom_free(vt); - aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, - "Error allocating partition buffer vt2"); - } - } - // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances - // for splits. + // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames) + // variances for splits. fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16, - minvar_16x16, variance4x4downsample, thresholds, - src_buf, src_stride, dst_buf, dst_stride, - is_key_frame, is_small_sb); + minvar_16x16, thresholds, src_buf, src_stride, + dst_buf, dst_stride, is_key_frame, is_small_sb); avg_64x64 = 0; for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { @@ -1761,11 +1765,8 @@ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { - if (variance4x4downsample[lvl1_scale_idx + lvl2_idx] != 1) continue; - VP16x16 *vtemp = - (!is_key_frame) - ? 
&vt2[lvl1_scale_idx + lvl2_idx] - : &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + if (!is_key_frame) continue; + VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8); fill_variance_tree(vtemp, BLOCK_16X16); @@ -1892,14 +1893,8 @@ const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2); const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2); const int split_index = 21 + lvl1_scale_idx + lvl2_idx; - // For inter frames: if variance4x4downsample[] == 1 for this - // 16x16 block, then the variance is based on 4x4 down-sampling, - // so use vt2 in set_vt_partioning(), otherwise use vt. VP16x16 *vtemp = - (!is_key_frame && - variance4x4downsample[lvl1_scale_idx + lvl2_idx] == 1) - ? &vt2[lvl1_scale_idx + lvl2_idx] - : &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16, mi_row + y64_idx + y32_idx + y16_idx, mi_col + x64_idx + x32_idx + x16_idx, @@ -1923,7 +1918,6 @@ ref_frame_partition, mi_col, mi_row, is_small_sb); } - aom_free(vt2); aom_free(vt); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, choose_var_based_partitioning_time);
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c index a4def75..31cc37d 100644 --- a/av1/encoder/x86/av1_fwd_txfm_sse2.c +++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -2638,6 +2638,11 @@ } } +// Include top-level function only for 32-bit x86, to support Valgrind. +// For normal use, we require SSE4.1, so av1_lowbd_fwd_txfm_sse4_1 will be used +// instead of this function. However, 32-bit Valgrind does not support SSE4.1, +// so we include a fallback to SSE2 to improve performance +#if AOM_ARCH_X86 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform @@ -2671,3 +2676,4 @@ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } +#endif // AOM_ARCH_X86
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c index 40b3b46..f3a0b15 100644 --- a/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -138,8 +138,9 @@ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); qparam[0] = _mm_set_epi32(round1, round1, round1, round0); - qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); - qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]); + qparam[2] = + _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]); qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], dequant_ptr[0]); @@ -149,8 +150,8 @@ // update round/quan/dquan for AC qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); - qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]); + qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]); qparam[3] = _mm_set1_epi32(dequant_ptr[1]); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr);
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c index ad0b374..52ddc66 100644 --- a/av1/encoder/x86/av1_k_means_avx2.c +++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -10,7 +10,7 @@ */ #include <immintrin.h> // AVX2 -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_avx2(__m256i a) {
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c index 4338bf7..6c75822 100644 --- a/av1/encoder/x86/av1_k_means_sse2.c +++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -11,7 +11,7 @@ #include <emmintrin.h> // SSE2 -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_sse2(__m128i a) {
diff --git a/av1/encoder/x86/cnn_avx2.c b/av1/encoder/x86/cnn_avx2.c index ee93b3d..9c26a56 100644 --- a/av1/encoder/x86/cnn_avx2.c +++ b/av1/encoder/x86/cnn_avx2.c
@@ -466,7 +466,7 @@ // As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, // the filter_width and filter_height are equal to 2 for layer >= 1. So // convolution happens at 2x2 for layer >= 1. -void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( +static void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) {
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c index 57725d1..f180c94 100644 --- a/av1/encoder/x86/error_intrin_avx2.c +++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -29,9 +29,9 @@ } } -static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256) { +static INLINE void av1_block_error_block_size16_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff); // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 @@ -44,9 +44,9 @@ *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256()); } -static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256) { +static INLINE void av1_block_error_block_size32_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { const __m256i zero = _mm256_setzero_si256(); const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); @@ -71,12 +71,12 @@ *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0); } -static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff, - const int16_t *dqcoeff, - __m256i *sse_256, - intptr_t num_coeff) { +static INLINE void av1_block_error_block_size64_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256, + intptr_t block_size) { const __m256i zero = _mm256_setzero_si256(); - for (int i = 0; i < num_coeff; i += 64) { + for (int i = 0; i < block_size; i += 64) { // Load 64 elements for coeff and dqcoeff. 
const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); @@ -126,17 +126,17 @@ } int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t num_coeff) { - assert(num_coeff % 16 == 0); + intptr_t block_size) { + assert(block_size % 16 == 0); __m256i sse_256 = _mm256_setzero_si256(); int64_t sse; - if (num_coeff == 16) - av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256); - else if (num_coeff == 32) - av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256); + if (block_size == 16) + av1_block_error_block_size16_avx2(coeff, dqcoeff, &sse_256); + else if (block_size == 32) + av1_block_error_block_size32_avx2(coeff, dqcoeff, &sse_256); else - av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff); + av1_block_error_block_size64_avx2(coeff, dqcoeff, &sse_256, block_size); // Save the higher 64 bit of each 128 bit lane. const __m256i sse_hi = _mm256_srli_si256(sse_256, 8);
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c index 9e06ebe..ebe7531 100644 --- a/av1/encoder/x86/hash_sse42.c +++ b/av1/encoder/x86/hash_sse42.c
@@ -12,6 +12,8 @@ #include <stdint.h> #include <smmintrin.h> +#include "config/av1_rtcd.h" + // Byte-boundary alignment issues #define ALIGN_SIZE 8 #define ALIGN_MASK (ALIGN_SIZE - 1)
diff --git a/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/av1/encoder/x86/highbd_block_error_intrin_avx2.c index ee3714d..340307c 100644 --- a/av1/encoder/x86/highbd_block_error_intrin_avx2.c +++ b/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -13,6 +13,7 @@ #include <stdio.h> #include "aom/aom_integer.h" #include "av1/common/common.h" +#include "config/av1_rtcd.h" int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
diff --git a/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/av1/encoder/x86/highbd_block_error_intrin_sse2.c index 0287f01..b0b2757 100644 --- a/av1/encoder/x86/highbd_block_error_intrin_sse2.c +++ b/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -13,6 +13,7 @@ #include <stdio.h> #include "av1/common/common.h" +#include "config/av1_rtcd.h" int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c index 6658ed3..1f76576 100644 --- a/av1/encoder/x86/pickrst_avx2.c +++ b/av1/encoder/x86/pickrst_avx2.c
@@ -345,21 +345,27 @@ } void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c index 50db305..af67062 100644 --- a/av1/encoder/x86/pickrst_sse4.c +++ b/av1/encoder/x86/pickrst_sse4.c
@@ -10,7 +10,7 @@ */ #include <assert.h> -#include <emmintrin.h> +#include <smmintrin.h> #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" @@ -524,21 +524,27 @@ } void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, - int h_end, int v_start, int v_end, - int dgd_stride, int src_stride, int64_t *M, - int64_t *H, aom_bit_depth_t bit_depth) { + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c index 12ac146..76980d6 100644 --- a/av1/encoder/x86/rdopt_sse4.c +++ b/av1/encoder/x86/rdopt_sse4.c
@@ -10,7 +10,7 @@ */ #include <assert.h> -#include <emmintrin.h> +#include <smmintrin.h> #include "aom_dsp/x86/synonyms.h" #include "config/av1_rtcd.h" @@ -29,10 +29,8 @@ // [ i j k l ] // [ m n o p ] - const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride], - *(int64_t *)&diff[2 * stride]); - const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride], - *(int64_t *)&diff[3 * stride]); + const __m128i pixelsa = xx_loadu_2x64(&diff[0 * stride], &diff[2 * stride]); + const __m128i pixelsb = xx_loadu_2x64(&diff[1 * stride], &diff[3 * stride]); // pixelsa = [d c b a l k j i] as i16 // pixelsb = [h g f e p o n m] as i16
diff --git a/av1/encoder/x86/wedge_utils_avx2.c b/av1/encoder/x86/wedge_utils_avx2.c index 9cde860..3f61c02 100644 --- a/av1/encoder/x86/wedge_utils_avx2.c +++ b/av1/encoder/x86/wedge_utils_avx2.c
@@ -31,7 +31,7 @@ uint64_t csse; const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); - const __m256i v_zext_q = yy_set1_64_from_32i(~0); + const __m256i v_zext_q = _mm256_set1_epi64x(~0u); __m256i v_acc0_q = _mm256_setzero_si256();
diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c index d7ac222..c300579 100644 --- a/av1/encoder/x86/wedge_utils_sse2.c +++ b/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@ uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); - const __m128i v_zext_q = xx_set1_64_from_32i(~0); + const __m128i v_zext_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128();
diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc index 62d6e74..83e88ba 100644 --- a/av1/ratectrl_rtc.cc +++ b/av1/ratectrl_rtc.cc
@@ -128,6 +128,7 @@ oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; rc->max_consec_drop = rc_cfg.max_consec_drop; + cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; oxcf->tool_cfg.bit_depth = AOM_BITS_8; oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC; oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL;
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 85390d5..980dfb9 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake
@@ -37,6 +37,7 @@ set_aom_detect_var(HAVE_NEON_I8MM 0 "Enables Armv8.2-A Neon i8mm intrinsics optimizations.") set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.") +set_aom_detect_var(HAVE_SVE2 0 "Enables Armv9-A SVE2 intrinsics optimizations.") # PPC feature flags. set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") @@ -84,6 +85,9 @@ set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") +set_aom_config_var(CONFIG_QUANT_MATRIX 1 + "Build with quantization matrices for AV1 encoder." + "AV1 decoder is always built with quantization matrices.") set_aom_config_var(CONFIG_REALTIME_ONLY 0 "Build for RTC-only. See aomcx.h for all disabled features.") set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") @@ -168,6 +172,9 @@ "AV1 experiment: Enable saliency map based encoding tuning for VMAF.") set_aom_config_var(CONFIG_CWG_C013 0 "AV1 experiment: Support for 7.x and 8.x levels.") +# Add this change to make aomenc reported PSNR consistent with libvmaf result. +set_aom_config_var(CONFIG_LIBVMAF_PSNR_PEAK 1 + "Use libvmaf PSNR peak for 10- and 12-bit") # # Variables in this section control optional features of the build system. @@ -206,6 +213,8 @@ "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON) set_aom_option_var(ENABLE_SVE "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON) +set_aom_option_var(ENABLE_SVE2 + "Enables Armv9-A SVE2 optimizations on AArch64 targets." ON) # VSX intrinsics flags. set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index 917e7ca..ac3e1325 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake
@@ -190,7 +190,7 @@ set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) elseif(AOM_TARGET_SYSTEM STREQUAL "Windows") if(NOT CMAKE_ASM_COMPILER) - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} -c -mimplicit-it=always) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} "-c -mimplicit-it=always") endif() else() if(NOT CMAKE_ASM_COMPILER) @@ -320,6 +320,10 @@ # minimum supported C++ version. If Clang is using this Standard Library # implementation, it cannot target C++11. require_cxx_flag_nomsvc("-std=c++14" YES) + elseif(CYGWIN AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # The GNU C++ compiler in Cygwin needs the -std=gnu++11 flag to make the + # POSIX function declarations visible in the Standard C Library headers. + require_cxx_flag_nomsvc("-std=gnu++11" YES) else() require_cxx_flag_nomsvc("-std=c++11" YES) endif() @@ -393,6 +397,13 @@ endif() add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") + + # Do not allow implicit vector type conversions on Clang builds (this is + # already the default on GCC builds). + if(CMAKE_C_COMPILER_ID MATCHES "Clang") + # Clang 8.0.1 (in Cygwin) doesn't support -flax-vector-conversions=none. + add_compiler_flag_if_supported("-flax-vector-conversions=none") + endif() endif() # Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set,
diff --git a/build/cmake/compiler_flags.cmake b/build/cmake/compiler_flags.cmake index f008b96..3afcd50 100644 --- a/build/cmake/compiler_flags.cmake +++ b/build/cmake/compiler_flags.cmake
@@ -176,11 +176,11 @@ endif() unset(HAVE_CXX_FLAG CACHE) - message("Checking C compiler flag support for: " ${cxx_flag}) + message("Checking C++ compiler flag support for: " ${cxx_flag}) check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) if(NOT HAVE_CXX_FLAG) message( - FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.") + FATAL_ERROR "${PROJECT_NAME} requires support for C++ flag: ${cxx_flag}.") endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 1fa934b..6e6fdb8 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake
@@ -14,11 +14,13 @@ set(AOM_ARCH_AARCH64 1) set(RTCD_ARCH_ARM "yes") - set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE") + set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE;SVE2") set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc") set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod") set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm") set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") + set(AOM_SVE2_DEFAULT_FLAG "-march=armv9-a+i8mm+sve2") # SVE2 is a v9-only + # feature # Check that the compiler flag to enable each flavor is supported by the # compiler. This may not be the case for new architecture features on old @@ -45,8 +47,8 @@ endif() endforeach() - # SVE requires that the Neon-SVE bridge header is also available. - if(ENABLE_SVE) + # SVE and SVE2 require that the Neon-SVE bridge header is also available. + if(ENABLE_SVE OR ENABLE_SVE2) set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") @@ -71,6 +73,7 @@ set(CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE}) if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0) set(ENABLE_SVE 0) + set(ENABLE_SVE2 0) endif() endif()
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl index 1cf52f0..f4a7084 100755 --- a/build/cmake/rtcd.pl +++ b/build/cmake/rtcd.pl
@@ -392,7 +392,7 @@ @ALL_ARCHS = filter(qw/neon/); arm; } elsif ($opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve/); + @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve sve2/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm;
diff --git a/common/args.c b/common/args.c index b5ede19..c380dde 100644 --- a/common/args.c +++ b/common/args.c
@@ -17,7 +17,6 @@ #include <limits.h> #include "aom/aom_integer.h" -#include "aom_ports/msvc.h" #include "aom/aom_codec.h" #include "common/tools_common.h"
diff --git a/common/tools_common.c b/common/tools_common.c index 4d77a1b..db02ca6 100644 --- a/common/tools_common.c +++ b/common/tools_common.c
@@ -97,7 +97,7 @@ int w = aom_img_plane_width(yuv_frame, plane); const int h = aom_img_plane_height(yuv_frame, plane); int r; - // Assuming that for nv12 we read all chroma data at one time + // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; /* Determine the correct plane based on the image format. The for-loop @@ -245,17 +245,21 @@ void aom_img_write(const aom_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * - ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -268,12 +272,16 @@ for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * bytespp; + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return false; + if (fread(buf, bytespp, w, file) != (size_t)w) return false; buf += stride; } }
diff --git a/common/tools_common.h b/common/tools_common.h index 9d891d1..cde2164 100644 --- a/common/tools_common.h +++ b/common/tools_common.h
@@ -20,7 +20,6 @@ #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "aom_ports/msvc.h" #if CONFIG_AV1_ENCODER #include "common/y4minput.h"
diff --git a/common/y4minput.c b/common/y4minput.c index 1974d76..6a8601e 100644 --- a/common/y4minput.c +++ b/common/y4minput.c
@@ -17,7 +17,6 @@ #include <string.h> #include "aom/aom_integer.h" -#include "aom_ports/msvc.h" #include "y4minput.h" // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
diff --git a/doc/dev_guide/av1_encoder.dox b/doc/dev_guide/av1_encoder.dox index 0f7e8f8..a40b589 100644 --- a/doc/dev_guide/av1_encoder.dox +++ b/doc/dev_guide/av1_encoder.dox
@@ -1313,6 +1313,34 @@ All the related functions are listed in \ref coefficient_coding. +\section architecture_simd SIMD usage + +In order to efficiently encode video on modern platforms, it is necessary to +implement optimized versions of many core encoding and decoding functions using +architecture-specific SIMD instructions. + +Functions which have optimized implementations will have multiple variants +in the code, each suffixed with the name of the appropriate instruction set. +There will additionally be an `_c` version, which acts as a reference +implementation which the SIMD variants can be tested against. + +As different machines with the same nominal architecture may support different +subsets of SIMD instructions, we have dynamic CPU detection logic which chooses +the appropriate functions to use at run time. This process is handled by +`build/cmake/rtcd.pl`, with function definitions in the files +`*_rtcd_defs.pl` elsewhere in the codebase. + +Currently SIMD is supported on the following platforms: + +- x86: Requires SSE4.1 or above + +- Arm: Requires Neon (Armv7-A and above) + +We aim to provide implementations of all performance-critical functions which +are compatible with the instruction sets listed above. Additional SIMD +extensions (e.g. AVX on x86, SVE on Arm) are also used to provide even +greater performance where available. + */ /*!\defgroup encoder_algo Encoder Algorithm
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c index da36d9f..b7fb7bc 100644 --- a/examples/aom_cx_set_ref.c +++ b/examples/aom_cx_set_ref.c
@@ -61,7 +61,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " "<frame> <limit(optional)>\n",
diff --git a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc index 9b9a0b9..4634ca6 100644 --- a/examples/av1_dec_fuzzer.cc +++ b/examples/av1_dec_fuzzer.cc
@@ -34,6 +34,14 @@ return 0; } + // Abusing the four unused bytes at the end of the IVF file header as a source + // of random bits. + unsigned int tile_mode = (data[IVF_FILE_HDR_SZ - 1] & 2) != 0; + unsigned int ext_tile_debug = (data[IVF_FILE_HDR_SZ - 1] & 4) != 0; + unsigned int is_annexb = (data[IVF_FILE_HDR_SZ - 1] & 8) != 0; + int output_all_layers = (data[IVF_FILE_HDR_SZ - 1] & 0x10) != 0; + int operating_point = data[IVF_FILE_HDR_SZ - 2] & 0x1F; + aom_codec_iface_t *codec_interface = aom_codec_av1_dx(); aom_codec_ctx_t codec; // Set thread count in the range [1, 64]. @@ -42,6 +50,13 @@ if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { return 0; } + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, tile_mode); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, ext_tile_debug); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, is_annexb); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, + output_all_layers); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OPERATING_POINT, + operating_point); data += IVF_FILE_HDR_SZ; size -= IVF_FILE_HDR_SZ; @@ -52,8 +67,13 @@ data += IVF_FRAME_HDR_SZ; frame_size = std::min(size, frame_size); - const aom_codec_err_t err = - aom_codec_decode(&codec, data, frame_size, nullptr); + aom_codec_stream_info_t stream_info; + stream_info.is_annexb = is_annexb; + aom_codec_err_t err = + aom_codec_peek_stream_info(codec_interface, data, size, &stream_info); + static_cast<void>(err); + + err = aom_codec_decode(&codec, data, frame_size, nullptr); static_cast<void>(err); aom_codec_iter_t iter = nullptr; aom_image_t *img = nullptr;
diff --git a/examples/inspect.c b/examples/inspect.c index ed77b5d..e285be0 100644 --- a/examples/inspect.c +++ b/examples/inspect.c
@@ -742,7 +742,7 @@ aom_free(buffer); } -void ifd_init_cb() { +void ifd_init_cb(void) { aom_inspect_init ii; ii.inspect_cb = inspect; ii.inspect_ctx = NULL; @@ -775,7 +775,7 @@ size_t frame_size = 0; EMSCRIPTEN_KEEPALIVE -int read_frame() { +int read_frame(void) { img = NULL; // This loop skips over any frames that are show_existing_frames, as @@ -824,16 +824,18 @@ } EMSCRIPTEN_KEEPALIVE -const char *get_aom_codec_build_config() { return aom_codec_build_config(); } +const char *get_aom_codec_build_config(void) { + return aom_codec_build_config(); +} EMSCRIPTEN_KEEPALIVE -int get_bit_depth() { return img->bit_depth; } +int get_bit_depth(void) { return img->bit_depth; } EMSCRIPTEN_KEEPALIVE -int get_bits_per_sample() { return img->bps; } +int get_bits_per_sample(void) { return img->bps; } EMSCRIPTEN_KEEPALIVE -int get_image_format() { return img->fmt; } +int get_image_format(void) { return img->fmt; } EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane) { return img->planes[plane]; } @@ -848,10 +850,10 @@ int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } EMSCRIPTEN_KEEPALIVE -int get_frame_width() { return info->frame_width; } +int get_frame_width(void) { return info->frame_width; } EMSCRIPTEN_KEEPALIVE -int get_frame_height() { return info->frame_height; } +int get_frame_height(void) { return info->frame_height; } static void parse_args(char **argv) { char **argi, **argj; @@ -949,7 +951,7 @@ } EMSCRIPTEN_KEEPALIVE -void quit() { +void quit(void) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_reader_close(reader); }
diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc index c37df79..c751e98 100644 --- a/examples/svc_encoder_rtc.cc +++ b/examples/svc_encoder_rtc.cc
@@ -1442,6 +1442,35 @@ return 63; } +static void set_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec, int frame_cnt) { + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + if (!map.active_map) die("Failed to allocate active map"); + + // Example map for testing. + for (unsigned int i = 0; i < map.rows; ++i) { + for (unsigned int j = 0; j < map.cols; ++j) { + int index = map.cols * i + j; + map.active_map[index] = 1; + if (frame_cnt < 300) { + if (i < map.rows / 2 && j < map.cols / 2) map.active_map[index] = 0; + } else if (frame_cnt >= 300) { + if (i < map.rows / 2 && j >= map.cols / 2) map.active_map[index] = 0; + } + } + } + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + int main(int argc, const char **argv) { AppInput app_input; AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; @@ -1494,6 +1523,9 @@ // Flag to test setting speed per layer. const int test_speed_per_layer = 0; + // Flag for testing active maps. + const int test_active_maps = 0; + /* Setup default input stream settings */ app_input.input_ctx.framerate.numerator = 30; app_input.input_ctx.framerate.denominator = 1; @@ -1675,6 +1707,9 @@ aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX); + aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE, + AOM_FULL_SUPERFRAME_DROP); + svc_params.number_spatial_layers = ss_number_layers; svc_params.number_temporal_layers = ts_number_layers; for (i = 0; i < ss_number_layers * ts_number_layers; ++i) { @@ -1871,6 +1906,8 @@ } } + if (test_active_maps) set_active_map(&cfg, &codec, frame_cnt); + // Do the layer encode. aom_usec_timer_start(&timer); if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
diff --git a/libs.doxy_template b/libs.doxy_template index ba77751..01da81a 100644 --- a/libs.doxy_template +++ b/libs.doxy_template
@@ -1219,15 +1219,6 @@ HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via Javascript. If disabled, the navigation index will @@ -1509,17 +1500,6 @@ FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # https://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX @@ -1820,14 +1800,6 @@ LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. 
If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2167,23 +2139,6 @@ DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTSIZE = 10 - # By default doxygen will tell dot to use the default font as specified with # DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set # the path where dot can find it using this tag. @@ -2401,18 +2356,6 @@ MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support
diff --git a/stats/rate_hist.c b/stats/rate_hist.c index ae76fda..d79ebc5 100644 --- a/stats/rate_hist.c +++ b/stats/rate_hist.c
@@ -42,8 +42,7 @@ if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || fps->den == 0) { - destroy_rate_histogram(hist); - return NULL; + goto fail; } // Determine the number of samples in the buffer. Use the file's framerate @@ -59,6 +58,7 @@ hist->pts = calloc(hist->samples, sizeof(*hist->pts)); hist->sz = calloc(hist->samples, sizeof(*hist->sz)); + if (hist->pts == NULL || hist->sz == NULL) goto fail; for (i = 0; i < RATE_BINS; i++) { hist->bucket[i].low = INT_MAX; hist->bucket[i].high = 0; @@ -66,6 +66,14 @@ } return hist; + +fail: + fprintf(stderr, + "Warning: Unable to allocate buffers required for " + "show_rate_histogram().\n" + "Continuing without rate histogram feature...\n"); + destroy_rate_histogram(hist); + return NULL; } void destroy_rate_histogram(struct rate_hist *hist) {
diff --git a/test/accounting_test.cc b/test/accounting_test.cc index 8b5c8af..033499d 100644 --- a/test/accounting_test.cc +++ b/test/accounting_test.cc
@@ -33,7 +33,7 @@ aom_write(&bw, 0, 32); aom_write(&bw, 0, 32); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos);
diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 979ee6b..de16541 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc
@@ -19,8 +19,10 @@ namespace { +// Params: test mode, speed, aq_mode and screen_content mode. class ActiveMapTest - : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int, + int, int>, public ::libaom_test::EncoderTest { protected: static const int kWidth = 208; @@ -32,6 +34,8 @@ void SetUp() override { InitializeConfig(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); + aq_mode_ = GET_PARAM(3); + screen_mode_ = GET_PARAM(4); } void PreEncodeFrameHook(::libaom_test::VideoSource *video, @@ -41,6 +45,9 @@ encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_TUNE_CONTENT, screen_mode_); + if (screen_mode_) encoder->Control(AV1E_SET_ENABLE_PALETTE, 1); } else if (video->frame() == 3) { aom_active_map_t map = aom_active_map_t(); /* clang-format off */ @@ -79,19 +86,22 @@ cfg_.g_pass = AOM_RC_ONE_PASS; cfg_.rc_end_usage = AOM_CBR; cfg_.kf_max_dist = 90000; - ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30, - 1, 0, 20); + ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 100, + 1, 0, 100); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } int cpu_used_; + int aq_mode_; + int screen_mode_; }; TEST_P(ActiveMapTest, Test) { DoTest(); } AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(5, 9)); + ::testing::Range(5, 12), ::testing::Values(0, 3), + ::testing::Values(0, 1)); } // namespace
diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc index ad48e73..0dfb912 100644 --- a/test/aom_image_test.cc +++ b/test/aom_image_test.cc
@@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <climits> + #include "aom/aom_image.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -47,6 +49,16 @@ 0); } +TEST(AomImageTest, AomImgAllocNone) { + const int kWidth = 128; + const int kHeight = 128; + + aom_image_t img; + aom_img_fmt_t format = AOM_IMG_FMT_NONE; + unsigned int align = 32; + ASSERT_EQ(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr); +} + TEST(AomImageTest, AomImgAllocNv12) { const int kWidth = 128; const int kHeight = 128; @@ -54,9 +66,72 @@ aom_image_t img; aom_img_fmt_t format = AOM_IMG_FMT_NV12; unsigned int align = 32; - EXPECT_NE(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr); + EXPECT_EQ(aom_img_alloc(&img, format, kWidth, kHeight, align), &img); EXPECT_EQ(img.stride[AOM_PLANE_U], img.stride[AOM_PLANE_Y]); EXPECT_EQ(img.stride[AOM_PLANE_V], 0); EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr); aom_img_free(&img); } + +TEST(AomImageTest, AomImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + aom_image_t *image = + aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The aligned width (UINT_MAX + 1) would overflow unsigned int. 
+ image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc_with_border(nullptr, AOM_IMG_FMT_I422, 1, INT_MAX, 1, + 0x40000000, 0); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } +}
diff --git a/test/av1_c_vs_simd_encode.sh b/test/av1_c_vs_simd_encode.sh old mode 100644 new mode 100755 index cc547c8..897ac08 --- a/test/av1_c_vs_simd_encode.sh +++ b/test/av1_c_vs_simd_encode.sh
@@ -10,14 +10,18 @@ ## ## This script checks the bit exactness between C and SIMD ## implementations of AV1 encoder. +## +. $(dirname $0)/tools_common.sh PRESETS="good rt" -LOWBD_CLIPS="yuv_raw_input yuv_480p_raw_input y4m_720p_input y4m_screen_input" -HIGHBD_CLIPS="y4m_360p_10bit_input" +LOWBD_CIF_CLIP="yuv_raw_input" +LOWBD_480p_CLIP="yuv_480p_raw_input" +LOWBD_720p_CLIP="y4m_720p_input" +HIGHBD_CLIP="y4m_360p_10bit_input" +SC_CLIP="y4m_screen_input" OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) -devnull='> /dev/null 2>&1' # Clips used in test. YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -93,21 +97,23 @@ fi } -cleanup() { - rm -rf ${AOM_TEST_OUTPUT_DIR} -} +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${AOM_TEST_OUTPUT_DIR} +# } # Echo AOM_SIMD_CAPS_MASK for different instruction set architecture. -avx512f() { +avx2() { echo "0x1FF" } -avx2() { - echo "0x0FF" +avx() { + echo "0x17F" } -avx() { - echo "0x07F" +sse4_2() { + echo "0x13F" } sse4_1() { @@ -131,15 +137,15 @@ local preset=$2 # Bit-rates: - local bitrate_lowres_good="100 1000" - local bitrate_480p_good="200 2000" - local bitrate_720p_good="600 6000" - local bitrate_scc_360p_good="400 1200" - local bitrate_lowres_rt="50 400" - local bitrate_480p_rt="100 1800" - local bitrate_720p_rt="150 2000" - local bitrate_scc_360p_rt="400 800" - local bitrate_hbd_360p="100 1600" + local bitrate_lowres_good="300" + local bitrate_480p_good="500" + local bitrate_720p_good="1000" + local bitrate_scc_360p_good="500" + local bitrate_lowres_rt="200" + local bitrate_480p_rt="300" + local bitrate_720p_rt="600" + local bitrate_scc_360p_rt="300" + local bitrate_hbd_360p="500" if [ "${preset}" = "good" ]; then if [ "${content}" = "yuv_raw_input" ]; then @@ -208,8 +214,8 @@ has_x86_isa_extn() { instruction_set=$1 - grep -q "$instruction_set" /proc/cpuinfo - if [ $? 
-eq 1 ]; then + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction set is not supported. return 1 fi } @@ -297,7 +303,8 @@ -DCMAKE_BUILD_TYPE=Release \ -DENABLE_CCACHE=1 \ '-DCMAKE_C_FLAGS_RELEASE=-O3 -g' \ - '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g'" + '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g' \ + -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DENABLE_TOOLS=0" for preset in $PRESETS; do echo "Building target[${preset} encoding]: ${target}" @@ -309,8 +316,16 @@ elog "Invalid preset" return 1 fi - eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" ${devnull} - eval make -j$(nproc) ${devnull} + if ! eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" \ + ${devnull}; then + elog "cmake failure" + return 1 + fi + if ! eval make -j$(nproc) aomenc ${devnull}; then + elog "build failure" + return 1 + fi + mv aomenc aomenc_${preset} done echo "Done building target: ${target}" @@ -322,9 +337,8 @@ local clip=$3 local bitrate=$4 local preset=$5 - diff ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ - ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null - if [ $? -eq 1 ]; then + if ! diff -q ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" return 1 fi @@ -332,35 +346,58 @@ av1_enc_test() { local encoder="$1" - local target="$2" - local preset="$3" + local arch="$2" + local target="$3" + local preset="$4" if [ -z "$(av1_enc_tool_path "${target}" "${preset}")" ]; then elog "aomenc_{preset} not found. 
It must exist in ${AOM_TEST_OUTPUT_DIR}/build_target_${target} path" return 1 fi if [ "${preset}" = "good" ]; then - local min_cpu_used=0 - local max_cpu_used=6 - local test_params=av1_encode_good_params - if [ "${target}" = "armv8-linux-gcc" ]; then - # TODO(BUG=aomedia:3474): Enable testing of high bit-depth clips after - # fixing C vs SIMD mismatches. - local test_clips="${LOWBD_CLIPS}" - else - local test_clips="${LOWBD_CLIPS} ${HIGHBD_CLIPS}" + if [ "${arch}" = "x86_64" ]; then + local min_cpu_used=0 + local max_cpu_used=6 + elif [ "${arch}" = "x86" ]; then + local min_cpu_used=2 + local max_cpu_used=3 fi + local test_params=av1_encode_good_params elif [ "${preset}" = "rt" ]; then local min_cpu_used=5 - local max_cpu_used=10 + local max_cpu_used=11 local test_params=av1_encode_rt_params - local test_clips="${LOWBD_CLIPS}" else elog "Invalid preset" return 1 fi for cpu in $(seq $min_cpu_used $max_cpu_used); do + if [ "${preset}" = "good" ]; then + if [ "${arch}" = "x86_64" ]; then + if [ "${cpu}" -lt 2 ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + elif [ "${cpu}" -lt 5 ]; then + local test_clips="${LOWBD_480p_CLIP} ${HIGHBD_CLIP}" + else + local test_clips="${LOWBD_720p_CLIP} ${HIGHBD_CLIP}" + fi + elif [ "${arch}" = "x86" ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + elif [ "${arch}" = "arm64" ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + fi + elif [ "${preset}" = "rt" ]; then + if [ "${cpu}" -lt 8 ]; then + local test_clips="${LOWBD_CIF_CLIP} ${SC_CLIP}" + else + local test_clips="${LOWBD_480p_CLIP} ${SC_CLIP}" + fi + else + elog "Invalid preset" + return 1 + fi + for clip in ${test_clips}; do local test_bitrates=$(get_bitrates ${clip} ${preset}) for bitrate in ${test_bitrates}; do @@ -371,8 +408,8 @@ ${devnull} if [ "${target}" != "generic" ]; then - compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} - if [ $? -eq 1 ]; then + if ! 
compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Found a mismatch return 1 fi fi @@ -392,40 +429,41 @@ # The cmake command line option -DENABLE_MMX=0 flag disables all SIMD # optimizations, and generates a C-only binary. local cmake_command="cmake $LIBAOM_SOURCE_DIR -DENABLE_MMX=0 \ - -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${arch}-linux.cmake" + -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake" fi echo "Build for: Generic ${arch}" - av1_enc_build "${target}" "${cmake_command}" + if ! av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do local encoder="$(av1_enc_tool_path "${target}" "${preset}")" - av1_enc_test $encoder "${target}" "${preset}" + av1_enc_test $encoder "${arch}" "${target}" "${preset}" done } -# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, AVX, AVX2 as there are -# no functions with MMX, SSE and AVX512 specialization. +# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2 as +# there are no functions with MMX, SSE and AVX512 specialization. # The value of environment variable 'AOM_SIMD_CAPS_MASK' controls enabling of different instruction # set extension optimizations. 
The value of the flag 'AOM_SIMD_CAPS_MASK' and the corresponding # instruction set extension optimization enabled are as follows: -# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX -# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants -# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants -# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants -# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants -# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants -# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants -# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants -# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants -# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +# SSE4_2 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX2 and lower variants +# 1 0 1 1 1 1 1 1 1 -> 0x17F -> Enable AVX and lower variants +# 1 0 0 1 1 1 1 1 1 -> 0x13F -> Enable SSE4_2 and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX ## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "AOM_SIMD_CAPS_MASK" as # all x86_64 platforms implement sse2. av1_test_x86() { local arch=$1 - uname -m | grep -q "x86" - if [ $? -eq 1 ]; then + if ! 
uname -m | grep -q "x86"; then elog "Machine architecture is not x86 or x86_64" return 0 fi @@ -434,28 +472,31 @@ local target="x86-linux" local cmake_command="cmake \ $LIBAOM_SOURCE_DIR \ - -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${target}.cmake" + -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake" elif [ $arch = "x86_64" ]; then local target="x86_64-linux" local cmake_command="cmake $LIBAOM_SOURCE_DIR" fi - local x86_isa_variants="avx2 avx sse4_1 ssse3 sse3 sse2" + # Available x86 isa variants: "avx2 avx sse4_2 sse4_1 ssse3 sse3 sse2" + local x86_isa_variants="avx2 sse4_2 sse2" echo "Build for x86: ${target}" - av1_enc_build "${target}" "${cmake_command}" + if ! av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do local encoder="$(av1_enc_tool_path "${target}" "${preset}")" for isa in $x86_isa_variants; do - has_x86_isa_extn $isa - if [ $? -eq 1 ]; then + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then echo "${isa} is not supported in this machine" continue fi export AOM_SIMD_CAPS_MASK=$($isa) - av1_enc_test $encoder "${target}" "${preset}" - if [ $? -eq 1 ]; then + if ! av1_enc_test $encoder "${arch}" "${target}" "${preset}"; then + # Found a mismatch return 1 fi unset AOM_SIMD_CAPS_MASK @@ -464,23 +505,20 @@ } av1_test_arm() { + local arch="arm64" local target="arm64-linux-gcc" local cmake_command="cmake $LIBAOM_SOURCE_DIR \ -DCMAKE_TOOLCHAIN_FILE=$LIBAOM_SOURCE_DIR/build/cmake/toolchains/${target}.cmake \ -DCMAKE_C_FLAGS=-Wno-maybe-uninitialized" echo "Build for arm64: ${target}" - av1_enc_build "${target}" "${cmake_command}" + if ! av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do - # Enable armv8 test for real-time only - # TODO(BUG=aomedia:3486, BUG=aomedia:3474): Enable testing for 'good' preset - # after fixing C vs NEON mismatches. 
- if [ "${preset}" = "good" ]; then - continue - fi local encoder="$(av1_enc_tool_path "${target}" "${preset}")" - av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${target}" "${preset}" - if [ $? -eq 1 ]; then + if ! av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${arch}" "${target}" "${preset}"; then + # Found a mismatch return 1 fi done @@ -488,14 +526,15 @@ av1_c_vs_simd_enc_test () { # Test x86 (32 bit) + # x86 requires the i686-linux-gnu toolchain: + # $ sudo apt-get install g++-i686-linux-gnu echo "av1 test for x86 (32 bit): Started." # Encode 'C' only av1_test_generic "x86" - # Encode with SIMD optimizations enabled - av1_test_x86 "x86" - if [ $? -eq 1 ]; then + if ! av1_test_x86 "x86"; then echo "av1 test for x86 (32 bit): Done, test failed." + return 1 else echo "av1 test for x86 (32 bit): Done, all tests passed." fi @@ -506,9 +545,9 @@ # Encode 'C' only av1_test_generic "x86_64" # Encode with SIMD optimizations enabled - av1_test_x86 "x86_64" - if [ $? -eq 1 ]; then + if ! av1_test_x86 "x86_64"; then echo "av1 test for x86_64 (64 bit): Done, test failed." + return 1 else echo "av1 test for x86_64 (64 bit): Done, all tests passed." fi @@ -516,20 +555,12 @@ # Test ARM echo "av1_test_arm: Started." - av1_test_arm - if [ $? -eq 1 ]; then + if ! av1_test_arm; then echo "av1 test for arm: Done, test failed." + return 1 else echo "av1 test for arm: Done, all tests passed." fi } -# Setup a trap function to clean up build, and output files after tests complete. -trap cleanup EXIT - -av1_c_vs_simd_enc_verify_environment -if [ $? -eq 1 ]; then - echo "Environment check failed." - exit 1 -fi -av1_c_vs_simd_enc_test +run_tests av1_c_vs_simd_enc_verify_environment av1_c_vs_simd_enc_test
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc index 76cf77a..b6458b0 100644 --- a/test/av1_convolve_scale_test.cc +++ b/test/av1_convolve_scale_test.cc
@@ -14,6 +14,7 @@ #include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_ports/aom_timer.h" @@ -22,6 +23,7 @@ #include "test/util.h" #include "av1/common/common_data.h" +#include "av1/common/filter.h" namespace { const int kTestIters = 10; @@ -32,80 +34,12 @@ const int kXStepQn = 16; const int kYStepQn = 20; +const int kNumFilterBanks = SWITCHABLE_FILTERS; + using libaom_test::ACMRandom; using std::make_tuple; using std::tuple; -enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP }; -int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; } - -// A 16-bit filter with a configurable number of taps. -class TestFilter { - public: - void set(NTaps ntaps, bool backwards); - - InterpFilterParams params_; - - private: - std::vector<int16_t> coeffs_; -}; - -void TestFilter::set(NTaps ntaps, bool backwards) { - const int n = NTapsToInt(ntaps); - assert(n >= 8 && n <= 12); - - // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus - // elements at the end so that convolutions can read off the end safely. - coeffs_.resize(n * SUBPEL_SHIFTS + 8); - - // The coefficients are pretty much arbitrary, but convolutions shouldn't - // over or underflow. For the first filter (subpels = 0), we use an - // increasing or decreasing ramp (depending on the backwards parameter). We - // don't want any zero coefficients, so we make it have an x-intercept at -1 - // or n. To ensure absence of under/overflow, we normalise the area under the - // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function - // gives the identity). - // - // When increasing, the function has the form: - // - // f(x) = A * (x + 1) - // - // Summing and rearranging for A gives A = 2 * I / (n * (n + 1)). 
If the - // filter is reversed, we have the same A but with formula - // - // g(x) = A * (n - x) - const int I = 1 << FILTER_BITS; - const float A = 2.f * I / (n * (n + 1.f)); - for (int i = 0; i < n; ++i) { - coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1))); - } - - // For the other filters, make them slightly different by swapping two - // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped. - const size_t filter_size = sizeof(coeffs_[0] * n); - int16_t *const filter0 = &coeffs_[0]; - for (int k = 1; k < SUBPEL_SHIFTS; ++k) { - int16_t *filterk = &coeffs_[k * n]; - memcpy(filterk, filter0, filter_size); - - const int idx0 = k % n; - const int idx1 = (7 * k) % n; - - const int16_t tmp = filterk[idx0]; - filterk[idx0] = filterk[idx1]; - filterk[idx1] = tmp; - } - - // Finally, write some rubbish at the end to make sure we don't use it. - for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i; - - // Fill in params - params_.filter_ptr = &coeffs_[0]; - params_.taps = n; - // These are ignored by the functions being tested. Set them to whatever. 
- params_.interp_filter = EIGHTTAP_REGULAR; -} - template <typename SrcPixel> class TestImage { public: @@ -244,14 +178,9 @@ typedef tuple<int, int> BlockDimension; struct BaseParams { - BaseParams(BlockDimension dimensions, NTaps num_taps_x, NTaps num_taps_y, - bool average) - : dims(dimensions), ntaps_x(num_taps_x), ntaps_y(num_taps_y), - avg(average) {} + BaseParams(BlockDimension dimensions) : dims(dimensions) {} BlockDimension dims; - NTaps ntaps_x, ntaps_y; - bool avg; }; template <typename SrcPixel> @@ -271,54 +200,62 @@ void SetParams(const BaseParams &params, int bd) { width_ = std::get<0>(params.dims); height_ = std::get<1>(params.dims); - ntaps_x_ = params.ntaps_x; - ntaps_y_ = params.ntaps_y; bd_ = bd; - avg_ = params.avg; - - filter_x_.set(ntaps_x_, false); - filter_y_.set(ntaps_y_, true); - convolve_params_ = - get_conv_params_no_round(avg_ != false, 0, nullptr, 0, 1, bd); delete image_; image_ = new TestImage<SrcPixel>(width_, height_, bd_); ASSERT_NE(image_, nullptr); } - void SetConvParamOffset(int i, int j, int is_compound, int do_average, - int use_dist_wtd_comp_avg) { - if (i == -1 && j == -1) { - convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.is_compound = is_compound; - convolve_params_.do_average = do_average; - } else { - convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.fwd_offset = quant_dist_lookup_table[j][i]; - convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i]; - convolve_params_.is_compound = is_compound; - convolve_params_.do_average = do_average; + std::vector<ConvolveParams> GetConvParams() { + std::vector<ConvolveParams> convolve_params; + + ConvolveParams param_no_compound = + get_conv_params_no_round(0, 0, nullptr, 0, 0, bd_); + convolve_params.push_back(param_no_compound); + + ConvolveParams param_compound_avg = + get_conv_params_no_round(1, 0, nullptr, 0, 1, bd_); + convolve_params.push_back(param_compound_avg); + + ConvolveParams 
param_compound_avg_dist_wtd = param_compound_avg; + param_compound_avg_dist_wtd.use_dist_wtd_comp_avg = 1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + param_compound_avg_dist_wtd.fwd_offset = quant_dist_lookup_table[j][i]; + param_compound_avg_dist_wtd.bck_offset = + quant_dist_lookup_table[j][1 - i]; + convolve_params.push_back(param_compound_avg_dist_wtd); + } } + + return convolve_params; } void Run() { ACMRandom rnd(ACMRandom::DeterministicSeed()); - for (int i = 0; i < kTestIters; ++i) { - int is_compound = 0; - SetConvParamOffset(-1, -1, is_compound, 0, 0); - Prep(&rnd); - RunOne(true); - RunOne(false); - image_->Check(); + std::vector<ConvolveParams> conv_params = GetConvParams(); - is_compound = 1; - for (int do_average = 0; do_average < 2; do_average++) { - for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2; - use_dist_wtd_comp_avg++) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 4; ++k) { - SetConvParamOffset(j, k, is_compound, do_average, - use_dist_wtd_comp_avg); + for (int i = 0; i < kTestIters; ++i) { + for (int subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (int filter_bank_y = 0; filter_bank_y < kNumFilterBanks; + ++filter_bank_y) { + const InterpFilter filter_y = + static_cast<InterpFilter>(filter_bank_y); + filter_y_ = + av1_get_interp_filter_params_with_block_size(filter_y, width_); + + for (int filter_bank_x = 0; filter_bank_x < kNumFilterBanks; + ++filter_bank_x) { + const InterpFilter filter_x = + static_cast<InterpFilter>(filter_bank_x); + filter_x_ = + av1_get_interp_filter_params_with_block_size(filter_x, width_); + + for (const auto c : conv_params) { + convolve_params_ = c; Prep(&rnd); RunOne(true); RunOne(false); @@ -329,7 +266,6 @@ } } } - void SpeedTest() { ACMRandom rnd(ACMRandom::DeterministicSeed()); Prep(&rnd); @@ -370,8 +306,8 @@ assert(rnd); // Choose subpel_x_ and subpel_y_. 
They should be less than - // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting" - // values: 0 and SCALE_SUBPEL_SHIFTS - 1 + // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to + // "interesting" values: 0 and SCALE_SUBPEL_SHIFTS - 1 subpel_x_ = RandomSubpel(rnd); subpel_y_ = RandomSubpel(rnd); @@ -379,10 +315,8 @@ } int width_, height_, bd_; - NTaps ntaps_x_, ntaps_y_; - bool avg_; int subpel_x_, subpel_y_; - TestFilter filter_x_, filter_y_; + const InterpFilterParams *filter_x_, *filter_y_; TestImage<SrcPixel> *image_; ConvolveParams convolve_params_; }; @@ -398,9 +332,8 @@ ConvolveParams *conv_params); // Test parameter list: -// <tst_fun, dims, ntaps_x, ntaps_y, avg> -typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool> - LowBDParams; +// <tst_fun, dims> +typedef tuple<LowbdConvolveFunc, BlockDimension> LowBDParams; class LowBDConvolveScaleTest : public ConvolveScaleTestBase<uint8_t>, @@ -412,12 +345,9 @@ tst_fun_ = GET_PARAM(0); const BlockDimension &block = GET_PARAM(1); - const NTaps ntaps_x = GET_PARAM(2); - const NTaps ntaps_y = GET_PARAM(3); const int bd = 8; - const bool avg = GET_PARAM(4); - SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + SetParams(BaseParams(block), bd); } void RunOne(bool ref) override { @@ -428,12 +358,12 @@ const int dst_stride = image_->dst_stride(); if (ref) { av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, - kXStepQn, subpel_y_, kYStepQn, &convolve_params_); + filter_x_, filter_y_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_); } else { - tst_fun_(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, - subpel_y_, kYStepQn, &convolve_params_); + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, filter_x_, + filter_y_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_); } } @@ -450,25 +380,40 @@ 
make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128), }; -const NTaps kNTaps[] = { EIGHT_TAP }; - TEST_P(LowBDConvolveScaleTest, Check) { Run(); } TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P( C, LowBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_convolve_2d_scale_c), - ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool())); + ::testing::ValuesIn(kBlockDim))); + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon), + ::testing::ValuesIn(kBlockDim))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon_dotprod), + ::testing::ValuesIn(kBlockDim))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_NEON_I8MM +INSTANTIATE_TEST_SUITE_P( + NEON_I8MM, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_neon_i8mm), + ::testing::ValuesIn(kBlockDim))); +#endif // HAVE_NEON_I8MM #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, LowBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1), - ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool())); + ::testing::ValuesIn(kBlockDim))); #endif // HAVE_SSE4_1 #if CONFIG_AV1_HIGHBITDEPTH @@ -481,9 +426,8 @@ ConvolveParams *conv_params, int bd); // Test parameter list: -// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd> -typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int> - HighBDParams; +// <tst_fun, dims, bd> +typedef tuple<HighbdConvolveFunc, BlockDimension, int> HighBDParams; class HighBDConvolveScaleTest : public ConvolveScaleTestBase<uint16_t>, @@ -495,12 +439,9 @@ tst_fun_ = GET_PARAM(0); const BlockDimension &block = GET_PARAM(1); - const NTaps ntaps_x = 
GET_PARAM(2); - const NTaps ntaps_y = GET_PARAM(3); - const bool avg = GET_PARAM(4); - const int bd = GET_PARAM(5); + const int bd = GET_PARAM(2); - SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + SetParams(BaseParams(block), bd); } void RunOne(bool ref) override { @@ -511,14 +452,14 @@ const int dst_stride = image_->dst_stride(); if (ref) { - av1_highbd_convolve_2d_scale_c( - src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_, - &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, - &convolve_params_, bd_); + av1_highbd_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, + height_, filter_x_, filter_y_, subpel_x_, + kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); } else { - tst_fun_(src, src_stride, dst, dst_stride, width_, height_, - &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, - subpel_y_, kYStepQn, &convolve_params_, bd_); + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, filter_x_, + filter_y_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); } } @@ -535,16 +476,14 @@ C, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_c), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + ::testing::ValuesIn(kBDs))); #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + ::testing::ValuesIn(kBDs))); #endif // HAVE_SSE4_1 #if HAVE_NEON @@ -552,8 +491,7 @@ NEON, HighBDConvolveScaleTest, ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_neon), ::testing::ValuesIn(kBlockDim), - ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), - ::testing::Bool(), ::testing::ValuesIn(kBDs))); + 
::testing::ValuesIn(kBDs))); #endif // HAVE_NEON
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 5bbac21..2c630b7 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc
@@ -325,7 +325,8 @@ class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -530,7 +531,8 @@ class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -631,6 +633,11 @@ BuildHighbdParams(av1_highbd_convolve_x_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1ConvolveXHighbdTest, + BuildHighbdParams(av1_highbd_convolve_x_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-x IntraBC functions (high bit-depth) ///////////////////////////////////////////////////////////////// @@ -732,7 +739,8 @@ class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> { public: void RunTest() { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. 
+ for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -822,6 +830,16 @@ BuildLowbdParams(av1_convolve_y_sr_neon)); #endif +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveYTest, + BuildLowbdParams(av1_convolve_y_sr_neon_dotprod)); +#endif + +#if HAVE_NEON_I8MM +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1ConvolveYTest, + BuildLowbdParams(av1_convolve_y_sr_neon_i8mm)); +#endif + //////////////////////////////////////////////////////////////// // Single reference convolve-y IntraBC functions (low bit-depth) //////////////////////////////////////////////////////////////// @@ -908,7 +926,8 @@ class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> { public: void RunTest() { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL; ++filter) { InterpFilter f = static_cast<InterpFilter>(filter); @@ -998,6 +1017,11 @@ BuildHighbdParams(av1_highbd_convolve_y_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1ConvolveYHighbdTest, + BuildHighbdParams(av1_highbd_convolve_y_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-y IntraBC functions (high bit-depth) ///////////////////////////////////////////////////////////////// @@ -1183,8 +1207,9 @@ class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. 
+ for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) { if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) || @@ -1306,6 +1331,11 @@ BuildLowbdParams(av1_convolve_2d_sr_neon_i8mm)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1Convolve2DTest, + BuildLowbdParams(av1_convolve_2d_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-2D IntraBC functions (low bit-depth) ///////////////////////////////////////////////////////////////// @@ -1409,8 +1439,9 @@ : public AV1ConvolveTest<highbd_convolve_2d_func> { public: void RunTest() { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) { if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) || @@ -1523,6 +1554,11 @@ BuildHighbdParams(av1_highbd_convolve_2d_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1Convolve2DHighbdTest, + BuildHighbdParams(av1_highbd_convolve_2d_sr_sve2)); +#endif + ////////////////////////////////////////////////////////////////// // Single reference convolve-2d IntraBC functions (high bit-depth) ////////////////////////////////////////////////////////////////// @@ -1756,7 +1792,8 @@ public: void RunTest() { auto compound_params = GetCompoundParams(); - for (int sub_pix = 0; sub_pix < 16; ++sub_pix) { + // Do not test the no-op filter. 
+ for (int sub_pix = 1; sub_pix < 16; ++sub_pix) { for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) { for (const auto &c : compound_params) { TestConvolve(sub_pix, static_cast<InterpFilter>(f), c); @@ -1858,7 +1895,8 @@ public: void RunTest() { auto compound_params = GetCompoundParams(); - for (int sub_pix = 0; sub_pix < 16; ++sub_pix) { + // Do not test the no-op filter. + for (int sub_pix = 1; sub_pix < 16; ++sub_pix) { for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) { for (const auto &c : compound_params) { TestConvolve(sub_pix, static_cast<InterpFilter>(f), c); @@ -1943,6 +1981,12 @@ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1ConvolveXHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH //////////////////////////////////////////////// @@ -2023,6 +2067,12 @@ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1ConvolveYHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////// @@ -2245,8 +2295,9 @@ auto compound_params = GetCompoundParams(); for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. 
+ for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (const auto &compound : compound_params) { TestConvolve(static_cast<InterpFilter>(h_f), static_cast<InterpFilter>(v_f), sub_x, sub_y, @@ -2312,11 +2363,6 @@ INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest, BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c)); -#if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest, - BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2)); -#endif - #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest, BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3)); @@ -2356,8 +2402,9 @@ auto compound_params = GetCompoundParams(); for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) { for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) { - for (int sub_x = 0; sub_x < 16; ++sub_x) { - for (int sub_y = 0; sub_y < 16; ++sub_y) { + // Do not test the no-op filter. + for (int sub_x = 1; sub_x < 16; ++sub_x) { + for (int sub_y = 1; sub_y < 16; ++sub_y) { for (const auto &compound : compound_params) { TestConvolve(static_cast<InterpFilter>(h_f), static_cast<InterpFilter>(v_f), sub_x, sub_y, @@ -2442,6 +2489,12 @@ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1Convolve2DHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH } // namespace
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc index 2ed5d94..4a5a634 100644 --- a/test/av1_fwd_txfm2d_test.cc +++ b/test/av1_fwd_txfm2d_test.cc
@@ -443,7 +443,7 @@ using ::testing::Values; using ::testing::ValuesIn; -#if HAVE_SSE2 +#if AOM_ARCH_X86 && HAVE_SSE2 static TX_SIZE fwd_txfm_for_sse2[] = { TX_4X4, TX_8X8, @@ -469,15 +469,14 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest, Combine(ValuesIn(fwd_txfm_for_sse2), Values(av1_lowbd_fwd_txfm_sse2))); -#endif // HAVE_SSE2 +#endif // AOM_ARCH_X86 && HAVE_SSE2 #if HAVE_SSE4_1 -static TX_SIZE fwd_txfm_for_sse41[] = { - TX_4X4, - TX_64X64, - TX_32X64, - TX_64X32, -}; +static TX_SIZE fwd_txfm_for_sse41[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, + TX_64X64, TX_4X8, TX_8X4, TX_8X16, + TX_16X8, TX_16X32, TX_32X16, TX_32X64, + TX_64X32, TX_4X16, TX_16X4, TX_8X32, + TX_32X8, TX_16X64, TX_64X16 }; INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest, Combine(ValuesIn(fwd_txfm_for_sse41),
diff --git a/test/av1_txfm_test.cc b/test/av1_txfm_test.cc index 77c0ec1..23e260b 100644 --- a/test/av1_txfm_test.cc +++ b/test/av1_txfm_test.cc
@@ -116,7 +116,7 @@ double Sqrt2 = pow(2, 0.5); double invSqrt2 = 1 / pow(2, 0.5); -double dct_matrix(double n, double k, int size) { +static double dct_matrix(double n, double k, int size) { return cos(PI * (2 * n + 1) * k / (2 * size)); } @@ -207,7 +207,7 @@ } } -void reference_idtx_1d(const double *in, double *out, int size) { +static void reference_idtx_1d(const double *in, double *out, int size) { double scale = 0; if (size == 4) scale = Sqrt2;
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc index 1055ff3..2234561 100644 --- a/test/av1_wedge_utils_test.cc +++ b/test/av1_wedge_utils_test.cc
@@ -408,4 +408,16 @@ av1_wedge_compute_delta_squares_avx2))); #endif // HAVE_AVX2 +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, + av1_wedge_sse_from_residuals_sve))); + +INSTANTIATE_TEST_SUITE_P( + SVE, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, + av1_wedge_sign_from_residuals_sve))); +#endif // HAVE_SVE + } // namespace
diff --git a/test/avg_test.cc b/test/avg_test.cc index d7817a8..6f4c2ff 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc
@@ -1021,6 +1021,15 @@ make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon))); #endif +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VectorVarTest, + ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(3, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(4, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(5, &aom_vector_var_c, &aom_vector_var_sve))); +#endif // HAVE_SVE + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, VectorVarTest,
diff --git a/test/binary_codes_test.cc b/test/binary_codes_test.cc index 45660cf..2c2dfb4 100644 --- a/test/binary_codes_test.cc +++ b/test/binary_codes_test.cc
@@ -59,7 +59,7 @@ } } } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc index 17a9aa7..52c58e0 100644 --- a/test/boolcoder_test.cc +++ b/test/boolcoder_test.cc
@@ -66,7 +66,7 @@ aom_write(&bw, bit, static_cast<int>(probas[i])); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); @@ -100,7 +100,7 @@ for (int i = 0; i < kSymbols; i++) { aom_write(&bw, 0, p); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); uint32_t last_tell = aom_reader_tell(&br); @@ -146,7 +146,7 @@ for (int i = 0; i < kSymbols; i++) { aom_write(&bw, 1, p); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); ASSERT_FALSE(aom_reader_has_overflowed(&br));
diff --git a/test/cdef_test.cc b/test/cdef_test.cc index 5959dab..ac0591f 100644 --- a/test/cdef_test.cc +++ b/test/cdef_test.cc
@@ -441,10 +441,11 @@ constexpr int stride = MAX_CDEF_BLOCK; int error = 0; for (int k = 0; k < kIterations && !error; k++) { - // Generate a random value between 1 and 256, making sure height is even. - // Test once for very small values to avoid potential overflows. - const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1; - const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2; + // This function operates on values of width that are either 4 or a + // multiple of 8. For height, generate a random value between 1 and 256, + // making sure it is even. + const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8; + const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2; for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { src_[i * stride + j] = rnd_.Rand8(); @@ -524,10 +525,11 @@ constexpr int stride = MAX_CDEF_BLOCK; int error = 0; for (int k = 0; k < kIterations && !error; k++) { - // Generate a random value between 1 and 256, making sure height is even. - // Test once for very small values to avoid potential overflows. - const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1; - const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2; + // This function operates on values of width that are either 4 or a + // multiple of 8. For height, generate a random value between 1 and 256, + // making sure it is even. + const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8; + const int height = k == 0 ? 
4 : (rnd_.Rand8() % 128 + 1) * 2; for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { src_[i * stride + j] = rnd_.Rand16(); @@ -612,7 +614,7 @@ using std::make_tuple; -#if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON) +#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON) static const CdefFilterBlockFunctions kCdefFilterFuncC[] = { { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c, &cdef_filter_8_3_c } @@ -624,50 +626,7 @@ }; #endif -#if HAVE_SSE2 -static const CdefFilterBlockFunctions kCdefFilterFuncSse2[] = { - { &cdef_filter_8_0_sse2, &cdef_filter_8_1_sse2, &cdef_filter_8_2_sse2, - &cdef_filter_8_3_sse2 } -}; - -static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse2[] = { - { &cdef_filter_16_0_sse2, &cdef_filter_16_1_sse2, &cdef_filter_16_2_sse2, - &cdef_filter_16_3_sse2 } -}; - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFBlockTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2), - ::testing::ValuesIn(kCdefFilterFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(8))); -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFBlockHighbdTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2), - ::testing::ValuesIn(kCdefFilterHighbdFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Range(10, 13, 2))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest, - ::testing::Values(make_tuple(&cdef_find_dir_sse2, - &cdef_find_dir_c))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest, - ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2, - &cdef_find_dir_dual_c))); - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFCopyRect8to16Test, - ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c, - &cdef_copy_rect8_8bit_to_16bit_sse2))); - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFCopyRect16to16Test, - ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c, - 
&cdef_copy_rect8_16bit_to_16bit_sse2))); -#endif - -#if HAVE_SSSE3 +#if AOM_ARCH_X86 && HAVE_SSSE3 static const CdefFilterBlockFunctions kCdefFilterFuncSsse3[] = { { &cdef_filter_8_0_ssse3, &cdef_filter_8_1_ssse3, &cdef_filter_8_2_ssse3, &cdef_filter_8_3_ssse3 } @@ -841,30 +800,7 @@ #endif // Test speed for all supported architectures -#if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFSpeedTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2), - ::testing::ValuesIn(kCdefFilterFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(8))); -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFSpeedHighbdTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2), - ::testing::ValuesIn(kCdefFilterHighbdFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(10))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest, - ::testing::Values(make_tuple(&cdef_find_dir_sse2, - &cdef_find_dir_c))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualSpeedTest, - ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2, - &cdef_find_dir_dual_c))); -#endif - -#if HAVE_SSSE3 +#if AOM_ARCH_X86 && HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, CDEFSpeedTest, ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
diff --git a/test/cnn_test.cc b/test/cnn_test.cc index 127ed3d..e5114b5 100644 --- a/test/cnn_test.cc +++ b/test/cnn_test.cc
@@ -2651,4 +2651,11 @@ &av1_cnn_convolve_no_maxpool_padding_valid_avx2))); #endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CNNConvolveTest, + ::testing::Values(CNNConvolveTestFuncs( + &av1_cnn_convolve_no_maxpool_padding_valid_c, + &av1_cnn_convolve_no_maxpool_padding_valid_neon))); +#endif + } // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc index c97f814..41e838a 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc
@@ -474,7 +474,7 @@ ref = CONVERT_TO_BYTEPTR(ref16_); } int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { const InterpFilter filter = (InterpFilter)filter_bank; @@ -555,7 +555,7 @@ } if (axis) seed_val += 8; int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { @@ -687,7 +687,7 @@ void FiltersWontSaturateWhenAddedPairwise() { int subpel_search; - for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + for (subpel_search = USE_2_TAPS; subpel_search <= USE_8_TAPS; ++subpel_search) { for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { const InterpFilter filter = (InterpFilter)filter_bank; @@ -773,6 +773,17 @@ WRAP(convolve8_horiz_neon, 12) WRAP(convolve8_vert_neon, 12) #endif // HAVE_NEON + +#if HAVE_SVE +WRAP(convolve8_horiz_sve, 8) +WRAP(convolve8_vert_sve, 8) + +WRAP(convolve8_horiz_sve, 10) +WRAP(convolve8_vert_sve, 10) + +WRAP(convolve8_horiz_sve, 12) +WRAP(convolve8_vert_sve, 12) +#endif // HAVE_SVE #endif // CONFIG_AV1_HIGHBITDEPTH #undef WRAP @@ -832,12 +843,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, HighbdConvolveTest, ::testing::ValuesIn(kArrayHighbdConvolve_sse2)); #endif -const ConvolveFunctions convolve8_sse2(aom_convolve8_horiz_sse2, - aom_convolve8_vert_sse2, 0); -const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) }; - -INSTANTIATE_TEST_SUITE_P(SSE2, LowbdConvolveTest, - ::testing::ValuesIn(kArrayConvolve_sse2)); #endif #if HAVE_SSSE3 @@ -919,4 +924,22 @@ ::testing::ValuesIn(kArray_Convolve8_neon_i8mm)); #endif // HAVE_NEON_I8MM +#if HAVE_SVE +#if CONFIG_AV1_HIGHBITDEPTH +const ConvolveFunctions wrap_convolve8_sve(wrap_convolve8_horiz_sve_8, 
+ wrap_convolve8_vert_sve_8, 8); +const ConvolveFunctions wrap_convolve10_sve(wrap_convolve8_horiz_sve_10, + wrap_convolve8_vert_sve_10, 10); +const ConvolveFunctions wrap_convolve12_sve(wrap_convolve8_horiz_sve_12, + wrap_convolve8_vert_sve_12, 12); +const ConvolveParam kArray_HighbdConvolve8_sve[] = { + ALL_SIZES_64(wrap_convolve8_sve), ALL_SIZES_64(wrap_convolve10_sve), + ALL_SIZES_64(wrap_convolve12_sve) +}; + +INSTANTIATE_TEST_SUITE_P(SVE, HighbdConvolveTest, + ::testing::ValuesIn(kArray_HighbdConvolve8_sve)); +#endif +#endif // HAVE_SVE + } // namespace
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc index 9733732..895c8ad 100644 --- a/test/corner_match_test.cc +++ b/test/corner_match_test.cc
@@ -27,13 +27,19 @@ using libaom_test::ACMRandom; -typedef double (*ComputeCrossCorrFunc)(const unsigned char *im1, int stride1, - int x1, int y1, const unsigned char *im2, - int stride2, int x2, int y2); +typedef bool (*ComputeMeanStddevFunc)(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev); +typedef double (*ComputeCorrFunc)(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2); using std::make_tuple; using std::tuple; -typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam; +typedef tuple<int, ComputeMeanStddevFunc, ComputeCorrFunc> CornerMatchParam; class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> { public: @@ -41,8 +47,11 @@ void SetUp() override; protected: - void RunCheckOutput(int run_times); - ComputeCrossCorrFunc target_func; + void GenerateInput(uint8_t *input1, uint8_t *input2, int w, int h, int mode); + void RunCheckOutput(); + void RunSpeedTest(); + ComputeMeanStddevFunc target_compute_mean_stddev_func; + ComputeCorrFunc target_compute_corr_func; libaom_test::ACMRandom rnd_; }; @@ -51,13 +60,87 @@ AV1CornerMatchTest::~AV1CornerMatchTest() = default; void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); - target_func = GET_PARAM(1); + target_compute_mean_stddev_func = GET_PARAM(1); + target_compute_corr_func = GET_PARAM(2); } -void AV1CornerMatchTest::RunCheckOutput(int run_times) { +void AV1CornerMatchTest::GenerateInput(uint8_t *input1, uint8_t *input2, int w, + int h, int mode) { + if (mode == 0) { + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + input1[i * w + j] = rnd_.Rand8(); + input2[i * w + j] = rnd_.Rand8(); + } + } else if (mode == 1) { + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + int v = rnd_.Rand8(); + input1[i * w + j] = v; + input2[i * w + j] = (v / 2) + 
(rnd_.Rand8() & 15); + } + } +} + +void AV1CornerMatchTest::RunCheckOutput() { const int w = 128, h = 128; - const int num_iters = 10000; - int i, j; + const int num_iters = 1000; + + std::unique_ptr<uint8_t[]> input1(new (std::nothrow) uint8_t[w * h]); + std::unique_ptr<uint8_t[]> input2(new (std::nothrow) uint8_t[w * h]); + ASSERT_NE(input1, nullptr); + ASSERT_NE(input2, nullptr); + + // Test the two extreme cases: + // i) Random data, should have correlation close to 0 + // ii) Linearly related data + noise, should have correlation close to 1 + int mode = GET_PARAM(0); + GenerateInput(&input1[0], &input2[0], w, h, mode); + + for (int i = 0; i < num_iters; ++i) { + int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w + 1 - MATCH_SZ); + int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h + 1 - MATCH_SZ); + int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w + 1 - MATCH_SZ); + int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h + 1 - MATCH_SZ); + + double c_mean1, c_one_over_stddev1, c_mean2, c_one_over_stddev2; + bool c_valid1 = aom_compute_mean_stddev_c(input1.get(), w, x1, y1, &c_mean1, + &c_one_over_stddev1); + bool c_valid2 = aom_compute_mean_stddev_c(input2.get(), w, x2, y2, &c_mean2, + &c_one_over_stddev2); + + double simd_mean1, simd_one_over_stddev1, simd_mean2, simd_one_over_stddev2; + bool simd_valid1 = target_compute_mean_stddev_func( + input1.get(), w, x1, y1, &simd_mean1, &simd_one_over_stddev1); + bool simd_valid2 = target_compute_mean_stddev_func( + input2.get(), w, x2, y2, &simd_mean2, &simd_one_over_stddev2); + + // Run the correlation calculation even if one of the "valid" flags is + // false, i.e. if one of the patches doesn't have enough variance. This is + // safe because any potential division by 0 is caught in + // aom_compute_mean_stddev(), and one_over_stddev is set to 0 instead. + // This causes aom_compute_correlation() to return 0, without causing a + // division by 0. 
+ const double c_corr = aom_compute_correlation_c( + input1.get(), w, x1, y1, c_mean1, c_one_over_stddev1, input2.get(), w, + x2, y2, c_mean2, c_one_over_stddev2); + const double simd_corr = target_compute_corr_func( + input1.get(), w, x1, y1, c_mean1, c_one_over_stddev1, input2.get(), w, + x2, y2, c_mean2, c_one_over_stddev2); + + ASSERT_EQ(simd_valid1, c_valid1); + ASSERT_EQ(simd_valid2, c_valid2); + ASSERT_EQ(simd_mean1, c_mean1); + ASSERT_EQ(simd_one_over_stddev1, c_one_over_stddev1); + ASSERT_EQ(simd_mean2, c_mean2); + ASSERT_EQ(simd_one_over_stddev2, c_one_over_stddev2); + ASSERT_EQ(simd_corr, c_corr); + } +} + +void AV1CornerMatchTest::RunSpeedTest() { + const int w = 16, h = 16; + const int num_iters = 1000000; aom_usec_timer ref_timer, test_timer; std::unique_ptr<uint8_t[]> input1(new (std::nothrow) uint8_t[w * h]); @@ -69,76 +152,82 @@ // i) Random data, should have correlation close to 0 // ii) Linearly related data + noise, should have correlation close to 1 int mode = GET_PARAM(0); - if (mode == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - input1[i * w + j] = rnd_.Rand8(); - input2[i * w + j] = rnd_.Rand8(); - } - } else if (mode == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int v = rnd_.Rand8(); - input1[i * w + j] = v; - input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15); - } + GenerateInput(&input1[0], &input2[0], w, h, mode); + + // Time aom_compute_mean_stddev() + double c_mean1, c_one_over_stddev1, c_mean2, c_one_over_stddev2; + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; i++) { + aom_compute_mean_stddev_c(input1.get(), w, 0, 0, &c_mean1, + &c_one_over_stddev1); + aom_compute_mean_stddev_c(input2.get(), w, 0, 0, &c_mean2, + &c_one_over_stddev2); } + aom_usec_timer_mark(&ref_timer); + int elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); - for (i = 0; i < num_iters; ++i) { - int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2); - int y1 = MATCH_SZ_BY2 + 
rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2); - int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2); - int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2); - - double res_c = av1_compute_cross_correlation_c(input1.get(), w, x1, y1, - input2.get(), w, x2, y2); - double res_simd = - target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2); - - if (run_times > 1) { - aom_usec_timer_start(&ref_timer); - for (j = 0; j < run_times; j++) { - av1_compute_cross_correlation_c(input1.get(), w, x1, y1, input2.get(), - w, x2, y2); - } - aom_usec_timer_mark(&ref_timer); - const int elapsed_time_c = - static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); - - aom_usec_timer_start(&test_timer); - for (j = 0; j < run_times; j++) { - target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2); - } - aom_usec_timer_mark(&test_timer); - const int elapsed_time_simd = - static_cast<int>(aom_usec_timer_elapsed(&test_timer)); - - printf( - "c_time=%d \t simd_time=%d \t " - "gain=%d\n", - elapsed_time_c, elapsed_time_simd, - (elapsed_time_c / elapsed_time_simd)); - } else { - ASSERT_EQ(res_simd, res_c); - } + double simd_mean1, simd_one_over_stddev1, simd_mean2, simd_one_over_stddev2; + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; i++) { + target_compute_mean_stddev_func(input1.get(), w, 0, 0, &simd_mean1, + &simd_one_over_stddev1); + target_compute_mean_stddev_func(input2.get(), w, 0, 0, &simd_mean2, + &simd_one_over_stddev2); } + aom_usec_timer_mark(&test_timer); + int elapsed_time_simd = static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "aom_compute_mean_stddev(): c_time=%6d simd_time=%6d " + "gain=%.3f\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / (double)elapsed_time_simd)); + + // Time aom_compute_correlation + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; i++) { + aom_compute_correlation_c(input1.get(), w, 0, 0, c_mean1, + c_one_over_stddev1, input2.get(), w, 0, 0, + c_mean2, 
c_one_over_stddev2); + } + aom_usec_timer_mark(&ref_timer); + elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; i++) { + target_compute_corr_func(input1.get(), w, 0, 0, c_mean1, c_one_over_stddev1, + input2.get(), w, 0, 0, c_mean2, + c_one_over_stddev2); + } + aom_usec_timer_mark(&test_timer); + elapsed_time_simd = static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "aom_compute_correlation(): c_time=%6d simd_time=%6d " + "gain=%.3f\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / (double)elapsed_time_simd)); } -TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); } -TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); } +TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); } +TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunSpeedTest(); } #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, AV1CornerMatchTest, - ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1), - make_tuple(1, &av1_compute_cross_correlation_sse4_1))); + ::testing::Values(make_tuple(0, &aom_compute_mean_stddev_sse4_1, + &aom_compute_correlation_sse4_1), + make_tuple(1, &aom_compute_mean_stddev_sse4_1, + &aom_compute_correlation_sse4_1))); #endif #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, AV1CornerMatchTest, - ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2), - make_tuple(1, &av1_compute_cross_correlation_avx2))); + ::testing::Values(make_tuple(0, &aom_compute_mean_stddev_avx2, + &aom_compute_correlation_avx2), + make_tuple(1, &aom_compute_mean_stddev_avx2, + &aom_compute_correlation_avx2))); #endif } // namespace AV1CornerMatch
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index b5f5d29..972d800 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc
@@ -107,7 +107,7 @@ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3); cfg_.g_timebase = video.timebase(); cfg_.rc_2pass_vbr_minsection_pct = 5; - cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; cfg_.rc_target_bitrate = 2000; cfg_.rc_max_quantizer = 63; cfg_.rc_min_quantizer = 0;
diff --git a/test/datarate_test.cc b/test/datarate_test.cc index a75a72f..9b73f79 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc
@@ -162,7 +162,7 @@ const int bitrate_array[2] = { 250, 650 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); - tile_column_ = 2; + tile_columns_ = 2; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate), effective_datarate_ * 0.85) @@ -354,7 +354,7 @@ const int bitrate_array[2] = { 250, 650 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); - tile_column_ = 1; + tile_columns_ = 1; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate), effective_datarate_ * 0.85)
diff --git a/test/datarate_test.h b/test/datarate_test.h index accc1ad..869c221 100644 --- a/test/datarate_test.h +++ b/test/datarate_test.h
@@ -42,7 +42,8 @@ bits_total_ = 0; denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; - tile_column_ = 0; + tile_columns_ = 0; + tile_rows_ = 0; screen_mode_ = false; max_perc_spike_ = 1.0; max_perc_spike_high_ = 1.0; @@ -62,7 +63,8 @@ if (video->frame() == 0) { encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); - encoder->Control(AV1E_SET_TILE_COLUMNS, tile_column_); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); encoder->Control(AV1E_SET_ROW_MT, 1); if (cfg_.g_usage == AOM_USAGE_REALTIME) { encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); @@ -203,7 +205,8 @@ int denoiser_offon_period_; unsigned int aq_mode_; bool speed_change_test_; - int tile_column_; + int tile_columns_; + int tile_rows_; bool screen_mode_; double max_perc_spike_; double max_perc_spike_high_;
diff --git a/test/disflow_test.cc b/test/disflow_test.cc index 124c9a9..bee9e12 100644 --- a/test/disflow_test.cc +++ b/test/disflow_test.cc
@@ -114,9 +114,19 @@ ::testing::Values(aom_compute_flow_at_point_sse4_1)); #endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, ComputeFlowTest, + ::testing::Values(aom_compute_flow_at_point_avx2)); +#endif + #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest, ::testing::Values(aom_compute_flow_at_point_neon)); #endif +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, ComputeFlowTest, + ::testing::Values(aom_compute_flow_at_point_sve)); +#endif + } // namespace
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc index 3135d2a..50d5320 100644 --- a/test/dr_prediction_test.cc +++ b/test/dr_prediction_test.cc
@@ -10,6 +10,7 @@ */ #include <tuple> +#include <vector> #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -18,6 +19,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" +#include "aom_ports/sanitizer.h" #include "av1/common/blockd.h" #include "av1/common/pred_common.h" #include "av1/common/reconintra.h" @@ -27,6 +29,9 @@ namespace { +const int kNumIntraNeighbourPixels = MAX_TX_SIZE * 2 + 32; +const int kIntraPredInputPadding = 16; + const int kZ1Start = 0; const int kZ2Start = 90; const int kZ3Start = 180; @@ -149,10 +154,6 @@ protected: static const int kMaxNumTests = 10000; static const int kIterations = 10; - static const int kDstStride = 64; - static const int kDstSize = kDstStride * kDstStride; - static const int kOffset = 16; - static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; DrPredTest() : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0), @@ -160,27 +161,12 @@ params_ = this->GetParam(); start_angle_ = params_.start_angle; stop_angle_ = start_angle_ + 90; - - dst_ref_ = &dst_ref_data_[0]; - dst_tst_ = &dst_tst_data_[0]; - dst_stride_ = kDstStride; - above_ = &above_data_[kOffset]; - left_ = &left_data_[kOffset]; - - for (int i = 0; i < kBufSize; ++i) { - above_data_[i] = rng_.Rand8(); - left_data_[i] = rng_.Rand8(); - } - - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } } ~DrPredTest() override = default; - void Predict(bool speedtest, int tx) { + void Predict(bool speedtest, int tx, const Pixel *above, const Pixel *left, + Pixel *dst_ref, Pixel *dst_tst, int dst_stride) { const int kNumTests = speedtest ? 
kMaxNumTests : 1; aom_usec_timer timer; int tst_time = 0; @@ -189,7 +175,7 @@ aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_, + params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above, left, upsample_above_, upsample_left_, dx_, dy_, bd_); } aom_usec_timer_mark(&timer); @@ -198,15 +184,17 @@ if (params_.tst_fn) { aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, - above_, left_, upsample_above_, + API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst, dst_stride, bw_, bh_, + above, left, upsample_above_, upsample_left_, dx_, dy_, bd_)); } aom_usec_timer_mark(&timer); tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = dst_tst_[i]; + for (int r = 0; r < bh_; ++r) { + for (int c = 0; c < bw_; ++c) { + dst_tst[r * dst_stride + c] = dst_ref[r * dst_stride + c]; + } } } @@ -216,24 +204,7 @@ void RunTest(bool speedtest, bool needsaturation, int p_angle) { bd_ = params_.bit_depth; - if (needsaturation) { - for (int i = 0; i < kBufSize; ++i) { - above_data_[i] = left_data_[i] = (1 << bd_) - 1; - } - } for (int tx = 0; tx < TX_SIZES_ALL; ++tx) { - if (params_.tst_fn == nullptr) { - for (int i = 0; i < kDstSize; ++i) { - dst_tst_[i] = (1 << bd_) - 1; - dst_ref_[i] = (1 << bd_) - 1; - } - } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } - } - bw_ = tx_size_wide[kTxSize[tx]]; bh_ = tx_size_high[kTxSize[tx]]; @@ -246,12 +217,54 @@ upsample_above_ = upsample_left_ = 0; } - Predict(speedtest, tx); + // Declare input buffers as local arrays to allow checking for + // over-reads. + DECLARE_ALIGNED(16, Pixel, left_data[kNumIntraNeighbourPixels]); + DECLARE_ALIGNED(16, Pixel, above_data[kNumIntraNeighbourPixels]); + + // We need to allow reading some previous bytes from the input pointers. 
+ const Pixel *above = &above_data[kIntraPredInputPadding]; + const Pixel *left = &left_data[kIntraPredInputPadding]; + + if (needsaturation) { + const Pixel sat = (1 << bd_) - 1; + for (int i = 0; i < kNumIntraNeighbourPixels; ++i) { + left_data[i] = sat; + above_data[i] = sat; + } + } else { + for (int i = 0; i < kNumIntraNeighbourPixels; ++i) { + left_data[i] = rng_.Rand8(); + above_data[i] = rng_.Rand8(); + } + } + + // Add additional padding to allow detection of over reads/writes when + // the transform width is equal to MAX_TX_SIZE. + const int dst_stride = MAX_TX_SIZE + 16; + std::vector<Pixel> dst_ref(dst_stride * bh_); + std::vector<Pixel> dst_tst(dst_stride * bh_); + + for (int r = 0; r < bh_; ++r) { + ASAN_POISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_POISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } + + Predict(speedtest, tx, above, left, dst_ref.data(), dst_tst.data(), + dst_stride); + + for (int r = 0; r < bh_; ++r) { + ASAN_UNPOISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_UNPOISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } for (int r = 0; r < bh_; ++r) { for (int c = 0; c < bw_; ++c) { - ASSERT_EQ(dst_ref_[r * dst_stride_ + c], - dst_tst_[r * dst_stride_ + c]) + ASSERT_EQ(dst_ref[r * dst_stride + c], dst_tst[r * dst_stride + c]) << bw_ << "x" << bh_ << " r: " << r << " c: " << c << " dx: " << dx_ << " dy: " << dy_ << " upsample_above: " << upsample_above_ @@ -292,19 +305,6 @@ } } - Pixel dst_ref_data_[kDstSize]; - Pixel dst_tst_data_[kDstSize]; - - Pixel left_data_[kBufSize]; - Pixel dummy_data_[kBufSize]; - Pixel above_data_[kBufSize]; - - Pixel *dst_ref_; - Pixel *dst_tst_; - Pixel *above_; - Pixel *left_; - int dst_stride_; - int enable_upsample_; int upsample_above_; int upsample_left_; @@ -386,6 +386,33 @@ TEST_P(LowbdDrPredTest, DISABLED_Speed) { 
RundrPredTest(1); } +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(HighbdDrPredTest, OperationCheck) { + if (params_.tst_fn == nullptr) return; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; angle++) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} + +TEST_P(HighbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, LowbdDrPredTest, @@ -453,32 +480,6 @@ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, AOM_BITS_12, kZ3Start))); - -TEST_P(HighbdDrPredTest, DISABLED_Speed) { - const int angles[] = { 3, 45, 87 }; - for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { - for (int i = 0; i < 3; ++i) { - int angle = angles[i] + start_angle_; - dx_ = av1_get_dx(angle); - dy_ = av1_get_dy(angle); - printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", - enable_upsample_, angle); - if (dx_ && dy_) RunTest(true, false, angle); - } - } -} - -TEST_P(HighbdDrPredTest, OperationCheck) { - if (params_.tst_fn == nullptr) return; - // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; - for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { - for (int angle = start_angle_; angle < stop_angle_; angle++) { - dx_ = av1_get_dx(angle); - dy_ = av1_get_dy(angle); - if (dx_ && dy_) RunTest(false, false, angle); - } - } -} #endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -495,6 +496,47 @@ 
&z3_wrapper<av1_dr_prediction_z3_neon>, AOM_BITS_8, kZ3Start))); +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, HighbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>, + AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>, + AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>, + AOM_BITS_10, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>, + AOM_BITS_12, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>, + AOM_BITS_12, kZ3Start))); +#endif // CONFIG_AV1_HIGHBITDEPTH + #endif // HAVE_NEON } // namespace
diff --git a/test/ec_test.cc b/test/ec_test.cc index e0555b4..a5284de 100644 --- a/test/ec_test.cc +++ b/test/ec_test.cc
@@ -78,6 +78,7 @@ tell[j + 1] = od_ec_enc_tell_frac(&enc); } ptr = od_ec_enc_done(&enc, &ptr_sz); + ASSERT_NE(ptr, nullptr); EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz) << "od_ec_enc_tell() lied: " "there's " @@ -143,6 +144,7 @@ od_ec_enc_patch_initial_bits(&enc, 0, 2); EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; ptr = od_ec_enc_done(&enc, &ptr_sz); + ASSERT_NE(ptr, nullptr); EXPECT_EQ(ptr_sz, 2u); EXPECT_EQ(ptr[0], 63) << "Got " << ptr[0]
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index b48c5a2..379d8d6 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc
@@ -10,6 +10,8 @@ */ #include <cassert> +#include <climits> +#include <cstdint> #include <cstdlib> #include <cstring> #include <tuple> @@ -556,6 +558,83 @@ encoder.Encode(false); } +TEST(EncodeAPI, PtsSmallerThanInitialPts) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1280; + cfg.g_h = 720; + cfg.rc_target_bitrate = 1000; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(aom_codec_encode(&enc, image, 12, 1, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_encode(&enc, image, 13, 1, 0), AOM_CODEC_OK); + // pts (10) is smaller than the initial pts (12). + ASSERT_EQ(aom_codec_encode(&enc, image, 10, 1, 0), AOM_CODEC_INVALID_PARAM); + + // Free resources. + aom_img_free(image); + aom_codec_destroy(&enc); +} + +TEST(EncodeAPI, PtsOrDurationTooBig) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1280; + cfg.g_h = 720; + cfg.rc_target_bitrate = 1000; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + // pts, when converted to ticks, is too big. + ASSERT_EQ(aom_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0), + AOM_CODEC_INVALID_PARAM); +#if ULONG_MAX > INT64_MAX + // duration is too big. 
+ ASSERT_EQ(aom_codec_encode(&enc, image, 0, (1ul << 63), 0), + AOM_CODEC_INVALID_PARAM); + // pts + duration is too big. + ASSERT_EQ(aom_codec_encode(&enc, image, 1, INT64_MAX, 0), + AOM_CODEC_INVALID_PARAM); +#endif + // pts + duration, when converted to ticks, is too big. +#if ULONG_MAX > INT64_MAX + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 0x1c0a0a1a3232, 0), + AOM_CODEC_INVALID_PARAM); +#endif + ASSERT_EQ(aom_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0), + AOM_CODEC_INVALID_PARAM); + + // Free resources. + aom_img_free(image); + aom_codec_destroy(&enc); +} + // Reproduces https://crbug.com/339877165. TEST(EncodeAPI, Buganizer339877165) { // Initialize libaom encoder. @@ -697,6 +776,76 @@ aom_codec_destroy(&enc); } +TEST(EncodeAPI, AomediaIssue3509VbrMinSection2Percent) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 1 percent to cause a signed integer overflow in the + // multiplication rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section in + // av1_rc_update_framerate() if the multiplication is done in the `int` type. + cfg.rc_2pass_vbr_minsection_pct = 2; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, /*duration=*/300, 0), + AOM_CODEC_OK); + + // Free resources. + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection101Percent) { + // Initialize libaom encoder. 
+ aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 100 percent to cause an error when vbr_min_bits is + // cast to `int` in av1_rc_update_framerate() if vbr_min_bits is not clamped + // to INT_MAX. + cfg.rc_2pass_vbr_minsection_pct = 101; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, /*duration=*/300, 0), + AOM_CODEC_OK); + + // Free resources. + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + class EncodeAPIParameterized : public testing::TestWithParam<std::tuple< /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {}; @@ -796,6 +945,32 @@ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0)); } +TEST(EncodeAPI, AllIntraAndUsePsnr) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA), + AOM_CODEC_OK); + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_PSNR), + AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + const aom_codec_cx_pkt_t *pkt; + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) { + ASSERT_EQ(pkt->kind, AOM_CODEC_PSNR_PKT); + } + } + + 
aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + // A test that reproduces bug aomedia:3534. TEST(EncodeAPI, AllIntraAndNoRefLast) { aom_codec_iface_t *iface = aom_codec_av1_cx();
diff --git a/test/error_block_test.cc b/test/error_block_test.cc index 176efdf..e7cd870 100644 --- a/test/error_block_test.cc +++ b/test/error_block_test.cc
@@ -245,7 +245,7 @@ using std::make_tuple; -#if (HAVE_SSE2) +#if HAVE_SSE2 const ErrorBlockParam kErrorBlockTestParamsSse2[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c, @@ -265,7 +265,7 @@ ::testing::ValuesIn(kErrorBlockTestParamsSse2)); #endif // HAVE_SSE2 -#if (HAVE_AVX2) +#if HAVE_AVX2 const ErrorBlockParam kErrorBlockTestParamsAvx2[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c, @@ -285,7 +285,7 @@ ::testing::ValuesIn(kErrorBlockTestParamsAvx2)); #endif // HAVE_AVX2 -#if (HAVE_NEON) +#if HAVE_NEON const ErrorBlockParam kErrorBlockTestParamsNeon[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c, @@ -304,4 +304,16 @@ INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest, ::testing::ValuesIn(kErrorBlockTestParamsNeon)); #endif // HAVE_NEON + +#if HAVE_SVE +const ErrorBlockParam kErrorBlockTestParamsSVE[] = { + make_tuple(&BlockError8BitWrapper<av1_block_error_sve>, + &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8), + make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sve>, + &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SVE, ErrorBlockTest, + ::testing::ValuesIn(kErrorBlockTestParamsSVE)); +#endif // HAVE_SVE } // namespace
diff --git a/test/ethread_test.cc b/test/ethread_test.cc index ce45394..415f5de 100644 --- a/test/ethread_test.cc +++ b/test/ethread_test.cc
@@ -18,6 +18,7 @@ #include "test/util.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/firstpass.h" namespace { @@ -411,9 +412,7 @@ const std::vector<size_t> ref_size_enc, const std::vector<std::string> ref_md5_enc, const std::vector<std::string> ref_md5_dec) { - // This value should be kept the same as MAX_NUM_THREADS - // in aom_thread.h - cfg_.g_threads = 64; + cfg_.g_threads = MAX_NUM_THREADS; ASSERT_NO_FATAL_FAILURE(RunLoop(video)); std::vector<size_t> multi_thr_max_row_mt_size_enc; std::vector<std::string> multi_thr_max_row_mt_md5_enc;
diff --git a/test/examples.sh b/test/examples.sh index 771a7b6..3e16123 100755 --- a/test/examples.sh +++ b/test/examples.sh
@@ -10,12 +10,13 @@ ## ## This file runs all of the tests for the libaom examples. ## +readonly EXEC_DIR="$(pwd)" . $(dirname $0)/tools_common.sh example_tests=$(ls -r $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="best_encode examples run_encodes tools_common av1_c_vs_simd_encode" +exclude_list="best_encode examples run_encodes tools_common" if [ "$(realtime_only_build)" = "yes" ]; then exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test" @@ -30,4 +31,7 @@ # Source each test script so that exporting variables can be avoided. AOM_TEST_NAME="$(basename ${test%.*})" . "${test}" + # Restore the working directory to the one at the beginning of execution. + # This avoids side-effects from tests that change the directory. + cd "${EXEC_DIR}" done
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 8f16c4e..0bf0f6b 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc
@@ -1,11 +1,12 @@ /* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <memory>
diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc index 808d966..2c6906f 100644 --- a/test/film_grain_table_test.cc +++ b/test/film_grain_table_test.cc
@@ -20,6 +20,8 @@ #include "test/util.h" #include "test/video_source.h" +namespace { + void grain_equal(const aom_film_grain_t *expected, const aom_film_grain_t *actual) { EXPECT_EQ(expected->apply_grain, actual->apply_grain); @@ -73,6 +75,8 @@ } } +} // namespace + TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { aom_film_grain_table_t table; memset(&table, 0, sizeof(table));
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc new file mode 100644 index 0000000..cff353a --- /dev/null +++ b/test/frame_resize_test.cc
@@ -0,0 +1,272 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <memory> +#include <new> + +#include "config/av1_rtcd.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/bitops.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +using std::make_tuple; +using std::tuple; + +const int kIters = 1000; + +typedef tuple<int, int> FrameDimension; + +// Check that two 8-bit output buffers are identical. 
+void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, + int height) { + ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; + for (int j = 0; j < height; ++j) { + if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { + p1 += width; + p2 += width; + continue; + } + for (int i = 0; i < width; ++i) { + ASSERT_EQ(p1[i], p2[i]) + << width << "x" << height << " Pixel mismatch at (" << i << ", " << j + << ")"; + } + } +} + +typedef bool (*LowBDResizeFunc)(uint8_t *intbuf, uint8_t *output, + int out_stride, int height, int height2, + int stride, int start_wd); +// Test parameter list: +// <tst_fun, dims> +typedef tuple<LowBDResizeFunc, FrameDimension> ResizeTestParams; + +class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + const int src_buf_size = (width_ / 2) * height_; + const int dest_buf_size = (width_ * height_) / 4; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); + + ref_dest_ = + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + + test_dest_ = + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); + } + + void RunTest() { + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + const int width2 = (width_ >> level); + const int height2 = (height_ >> level); + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); + + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, 
height2); + } + } + + void SpeedTest() { + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + const int width2 = (width_ >> level); + const int height2 = (height_ >> level); + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; j++) { + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; j++) { + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height2 + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + private: + LowBDResizeFunc test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeYTest); + +TEST_P(AV1ResizeYTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_AVX2 || HAVE_SSE2 +// Resolutions (width x height) to be tested for resizing. 
+const FrameDimension kFrameDim[] = { + make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), + make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), + make_tuple(286, 286), make_tuple(284, 284), make_tuple(282, 282), + make_tuple(280, 280), make_tuple(262, 262), make_tuple(258, 258), + make_tuple(256, 256), make_tuple(34, 34), +}; +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeYTest, + ::testing::Combine(::testing::Values(av1_resize_vert_dir_avx2), + ::testing::ValuesIn(kFrameDim))); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1ResizeYTest, + ::testing::Combine(::testing::Values(av1_resize_vert_dir_sse2), + ::testing::ValuesIn(kFrameDim))); +#endif + +typedef void (*LowBDResize_x_Func)(const uint8_t *const input, int in_stride, + uint8_t *intbuf, int height, + int filtered_length, int width2); + +typedef tuple<LowBDResize_x_Func, FrameDimension> Resize_x_TestParams; + +class AV1ResizeXTest : public ::testing::TestWithParam<Resize_x_TestParams> { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + const int src_buf_size = width_ * height_; + const int dest_buf_size = (width_ * height_) / 2; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); + + ref_dest_ = + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + + test_dest_ = + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); + } + + void RunTest() { + for (int i = 0; i < width_ * height_; ++i) src_[i] = rng_.Rand8(); + + for (int level = 1; level < n_levels_; ++level) { + const int width2 = (width_ >> level); + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), 
height_, + width2 << 1, width2); + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height_); + } + } + + void SpeedTest() { + for (int i = 0; i < width_ * height_; ++i) src_[i] = rng_.Rand8(); + + for (int level = 1; level < n_levels_; ++level) { + const int width2 = (width_ >> level); + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; ++j) { + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_, + width2 << 1, width2); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; ++j) { + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height_ + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + private: + LowBDResize_x_Func test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeXTest); + +TEST_P(AV1ResizeXTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1ResizeXTest, + ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2), + ::testing::ValuesIn(kFrameDim))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeXTest, + ::testing::Combine(::testing::Values(av1_resize_horz_dir_avx2), + ::testing::ValuesIn(kFrameDim))); +#endif + +} // 
namespace
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc index 303d580..59bca9b 100644 --- a/test/hbd_metrics_test.cc +++ b/test/hbd_metrics_test.cc
@@ -23,7 +23,6 @@ #include "aom_dsp/psnr.h" #include "aom_dsp/ssim.h" #include "aom_ports/mem.h" -#include "aom_ports/msvc.h" #include "aom_scale/yv12config.h" using libaom_test::ACMRandom; @@ -112,10 +111,10 @@ memset(&hbd_src, 0, sizeof(hbd_src)); memset(&hbd_dst, 0, sizeof(hbd_dst)); - aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, 0, 0); - aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, 0, 0); - aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, 0, 0); - aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, 0, 0); + aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, false, 0); + aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, false, 0); + aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, false, 0); + aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, false, 0); memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz); while (i < lbd_src.buffer_alloc_sz) {
diff --git a/test/kf_test.cc b/test/kf_test.cc index bc475fd..7d8cbfe 100644 --- a/test/kf_test.cc +++ b/test/kf_test.cc
@@ -9,9 +9,14 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <string.h> + #include <ostream> #include "aom/aom_codec.h" +#include "aom/aom_encoder.h" +#include "aom/aom_image.h" +#include "aom/aomcx.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -21,6 +26,87 @@ #define NUM_LAG_VALUES 3 namespace { +aom_image_t *CreateGrayImage(aom_img_fmt_t fmt, unsigned int w, + unsigned int h) { + aom_image_t *const image = aom_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyFrameMaximumInterval(unsigned int usage, unsigned int kf_max_dist) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = AOM_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = AOM_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const aom_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 
4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(aom_codec_encode(&enc, image, i, 1, 0), AOM_CODEC_OK); + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK); + got_data = false; + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + +TEST(KeyFrameIntervalTest, KeyFrameMaximumInterval) { + for (unsigned int usage : { AOM_USAGE_GOOD_QUALITY, AOM_USAGE_REALTIME }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples of + // 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyFrameMaximumInterval(usage, kf_max_dist); + } + } +} + typedef struct { const unsigned int min_kf_dist; const unsigned int max_kf_dist;
diff --git a/test/level_test.cc b/test/level_test.cc index a7c26d2..6d59f45 100644 --- a/test/level_test.cc +++ b/test/level_test.cc
@@ -135,12 +135,12 @@ // To save run time, we only test speed 4. if (cpu_used_ == 4) { libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 40); + 30, 1, 0, 30); target_level_ = kLevelKeepStats; cfg_.rc_target_bitrate = 1000; - cfg_.g_limit = 40; + cfg_.g_limit = 30; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], 0); + ASSERT_LE(level_[0], 0); } } @@ -148,12 +148,12 @@ // To save run time, we only test speed 4. if (cpu_used_ == 4) { libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 40); + 30, 1, 0, 30); target_level_ = kLevelKeepStats; cfg_.rc_target_bitrate = 4000; - cfg_.g_limit = 40; + cfg_.g_limit = 30; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], 4); + ASSERT_LE(level_[0], 4); } } @@ -166,7 +166,7 @@ target_level_ = target_level; cfg_.rc_target_bitrate = 4000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], target_level); + ASSERT_LE(level_[0], target_level); } }
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc index 534d9b1..04b6f45 100644 --- a/test/pickrst_test.cc +++ b/test/pickrst_test.cc
@@ -363,6 +363,12 @@ ::testing::Values(av1_highbd_pixel_proj_error_avx2)); #endif // HAVE_AVX2 +#if HAVE_NEON + +INSTANTIATE_TEST_SUITE_P(NEON, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_neon)); +#endif // HAVE_NEON + } // namespace pickrst_test_highbd #endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index 328d5b1..61f26ea 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc
@@ -19,6 +19,7 @@ #include "config/av1_rtcd.h" #include "aom/aom_codec.h" +#include "aom_dsp/txfm_common.h" #include "aom_ports/aom_timer.h" #include "av1/encoder/encoder.h" #include "av1/common/scan.h" @@ -482,9 +483,9 @@ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, - static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8), make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, - static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8) + static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8) }; INSTANTIATE_TEST_SUITE_P(AVX2, LowPrecisionQuantizeTest, @@ -704,9 +705,9 @@ make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, - static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8), make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon, - static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8) + static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8) }; INSTANTIATE_TEST_SUITE_P(NEON, LowPrecisionQuantizeTest,
diff --git a/test/resize_test.cc b/test/resize_test.cc index 7bad453..a84a465 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc
@@ -11,15 +11,16 @@ #include <climits> #include <vector> + +#include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" -#include "common/tools_common.h" #include "av1/encoder/encoder.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -#include "test/video_source.h" #include "test/util.h" +#include "test/video_source.h" #include "test/y4m_video_source.h" // Enable(1) or Disable(0) writing of the compressed bitstream. @@ -403,7 +404,7 @@ ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)), set_scale_mode_(false), set_scale_mode2_(false), - set_scale_mode3_(false) {} + set_scale_mode3_(false), is_screen_(false) {} ~ResizeRealtimeTest() override = default; void PreEncodeFrameHook(libaom_test::VideoSource *video, @@ -415,6 +416,8 @@ encoder->Control(AV1E_SET_ENABLE_OBMC, 0); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + if (is_screen_) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); } if (set_scale_mode_) { struct aom_scaling_mode mode; @@ -508,6 +511,7 @@ bool set_scale_mode_; bool set_scale_mode2_; bool set_scale_mode3_; + bool is_screen_; }; // Check the AOME_SET_SCALEMODE control by downsizing to @@ -685,6 +689,45 @@ } } +TEST_P(ResizeRealtimeTest, TestExternalResizeWorksUsePSNR) { + ResizingVideoSource video; + video.flag_codec_ = 1; + change_bitrate_ = false; + set_scale_mode_ = false; + set_scale_mode2_ = false; + set_scale_mode3_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.rc_dropframe_thresh = 30; + DefaultConfig(); + // Test external resizing with start resolution equal to + // 1. kInitialWidth and kInitialHeight + // 2. 
down-scaled kInitialWidth and kInitialHeight + for (int i = 0; i < 2; i++) { + video.change_start_resln_ = static_cast<bool>(i); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast<unsigned>(info.pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, + video.flag_codec_, video.change_start_resln_, + &expected_w, &expected_h); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); + } + frame_info_list_.clear(); + } +} + // Verify the dynamic resizer behavior for real time, 1 pass CBR mode. // Run at low bitrate, with resize_allowed = 1, and verify that we get // one resize down event. @@ -740,6 +783,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + init_flags_ = AOM_CODEC_USE_PSNR; cfg_.g_w = 640; cfg_.g_h = 480; change_bitrate_ = true; @@ -795,6 +839,63 @@ #endif } +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode for +// screen content mode. Start at low target bitrate, raise the bitrate in the +// middle of the clip (at frame# = frame_change_bitrate_), scaling-up should +// occur after bitrate is increased. 
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRateScreen) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = true; + frame_change_bitrate_ = 120; + set_scale_mode_ = false; + set_scale_mode2_ = false; + set_scale_mode3_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + is_screen_ = true; + DefaultConfig(); + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 100; + cfg_.rc_resize_mode = RESIZE_DYNAMIC; + cfg_.g_forced_max_frame_width = 1280; + cfg_.g_forced_max_frame_height = 1280; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + unsigned int frame_number = 0; + int resize_down_count = 0; + for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + if (frame_number < frame_change_bitrate_) { + // Verify that resize down occurs, before bitrate is increased. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + resize_down_count++; + } + last_w = info->w; + last_h = info->h; + } + frame_number++; + } + +#if CONFIG_AV1_DECODER + // Verify that we get at least 1 resize event in this test. + ASSERT_GE(resize_down_count, 1) + << "Resizing down should occur at lease once."; + EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + class ResizeCspTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM
diff --git a/test/sad_test.cc b/test/sad_test.cc index 5212748..64cf800 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc
@@ -3202,6 +3202,7 @@ make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1), make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1), + make_tuple(16, 4, &aom_sad_skip_16x4x4d_avx2, -1), #endif }; @@ -3294,6 +3295,7 @@ #if !CONFIG_REALTIME_ONLY make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1), make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1), + make_tuple(16, 4, &aom_sad16x4x3d_avx2, -1), #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/scan_test.cc b/test/scan_test.cc index 571658e..3ba39de 100644 --- a/test/scan_test.cc +++ b/test/scan_test.cc
@@ -25,8 +25,8 @@ } } -int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, - SCAN_MODE mode) { +static int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, + SCAN_MODE mode) { const int16_t *scan = scan_order->scan; const int16_t *iscan = scan_order->iscan; int dim = w + h - 1;
diff --git a/test/segment_binarization_sync.cc b/test/segment_binarization_sync.cc index bd8cf11..108e66a 100644 --- a/test/segment_binarization_sync.cc +++ b/test/segment_binarization_sync.cc
@@ -10,15 +10,14 @@ */ #include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/common/seg_common.h" +#include "av1/decoder/decodemv.h" +#include "av1/encoder/bitstream.h" #include "test/acm_random.h" using libaom_test::ACMRandom; -extern "C" { -int av1_neg_interleave(int x, int ref, int max); -int av1_neg_deinterleave(int diff, int ref, int max); -} - namespace { struct Segment { @@ -28,8 +27,6 @@ }; Segment GenerateSegment(int seed) { - static const int MAX_SEGMENTS = 8; - ACMRandom rnd_(seed); Segment segment;
diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc index 64465c8..054fbcc 100644 --- a/test/sharpness_test.cc +++ b/test/sharpness_test.cc
@@ -30,7 +30,7 @@ kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood), { { 2, { { 2, 37.6 }, { 5, 37.6 } } }, { 4, { { 2, 37.5 }, { 5, 37.5 } } }, - { 6, { { 2, 37.5 }, { 5, 37.5 } } } } }, + { 6, { { 2, 37.4 }, { 5, 37.4 } } } } }, { static_cast<int>(::libaom_test::kAllIntra), { { 3, { { 2, 42.2 }, { 5, 42.2 } } }, { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } },
diff --git a/test/simd_cmp_neon.cc b/test/simd_cmp_neon.cc deleted file mode 100644 index 53c1e2a..0000000 --- a/test/simd_cmp_neon.cc +++ /dev/null
@@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ -#define ARCH NEON -#define ARCH_POSTFIX(name) name##_neon -#define SIMD_NAMESPACE simd_test_neon -#include "test/simd_cmp_impl.h" -#endif
diff --git a/test/simd_neon_test.cc b/test/simd_neon_test.cc deleted file mode 100644 index b67b188..0000000 --- a/test/simd_neon_test.cc +++ /dev/null
@@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ -#define ARCH NEON -#define ARCH_POSTFIX(name) name##_neon -#define SIMD_NAMESPACE simd_test_neon -#include "test/simd_impl.h" -#endif
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc index 70d8da5..fd6fb88 100644 --- a/test/sse_sum_test.cc +++ b/test/sse_sum_test.cc
@@ -173,4 +173,10 @@ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_avx2))); #endif // HAVE_AVX2 +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, SumSSETest, + ::testing::Values(TestFuncs(&aom_sum_sse_2d_i16_c, + &aom_sum_sse_2d_i16_sve))); +#endif // HAVE_SVE + } // namespace
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index cba33b7..7b98ced 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc
@@ -172,6 +172,14 @@ #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_sve))); + +#endif // HAVE_SVE + #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, SumSquaresTest, @@ -200,8 +208,8 @@ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max; - const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize - : rng_(kMaxSize) + 1; + // Block size is between 64 and 128 * 128 and is always a multiple of 64. + const int n = (rng_(255) + 1) * 64; const uint64_t ref_res = params_.ref_func(src, n); uint64_t tst_res; @@ -221,8 +229,8 @@ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max; } - const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize - : rng_(kMaxSize) + 1; + // Block size is between 64 and 128 * 128 and is always a multiple of 64. + const int n = (rng_(255) + 1) * 64; const uint64_t ref_res = params_.ref_func(src, n); uint64_t tst_res; @@ -246,6 +254,13 @@ #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, SumSquares1DTest, + ::testing::Values(TestFuncs1D( + aom_sum_squares_i16_c, aom_sum_squares_i16_sve))); + +#endif // HAVE_SVE + typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height); typedef libaom_test::FuncParam<SSEFunc> TestSSEFuncs; @@ -443,6 +458,15 @@ Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); #endif // HAVE_AVX2 +#if HAVE_SVE +#if CONFIG_AV1_HIGHBITDEPTH +TestSSEFuncs sse_sve[] = { TestSSEFuncs(&aom_highbd_sse_c, + &aom_highbd_sse_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, SSETest, + Combine(ValuesIn(sse_sve), Range(4, 129, 4))); +#endif +#endif // HAVE_SVE + ////////////////////////////////////////////////////////////////////////////// // get_blk sum squares test functions ////////////////////////////////////////////////////////////////////////////// @@ -595,6 +619,14 @@ 
ValuesIn(kValidBlockSize))); #endif // HAVE_NEON +#if HAVE_SVE +TestSSE_SumFuncs sse_sum_sve[] = { TestSSE_SumFuncs(&aom_get_blk_sse_sum_c, + &aom_get_blk_sse_sum_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, SSE_Sum_Test, + Combine(ValuesIn(sse_sum_sve), + ValuesIn(kValidBlockSize))); +#endif // HAVE_SVE + ////////////////////////////////////////////////////////////////////////////// // 2D Variance test functions ////////////////////////////////////////////////////////////////////////////// @@ -885,4 +917,12 @@ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_neon))); #endif // HAVE_NEON + +#if HAVE_SVE + +INSTANTIATE_TEST_SUITE_P(SVE, Highbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, + &aom_var_2d_u16_sve))); + +#endif // HAVE_SVE } // namespace
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index cc3fb67..16fbb0b 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc
@@ -118,15 +118,8 @@ encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0); encoder->Control(AV1E_SET_DELTAQ_MODE, 0); if (cfg_.g_threads > 1) { - if (cfg_.g_threads == 4) { - encoder->Control(AV1E_SET_TILE_COLUMNS, 2); - encoder->Control(AV1E_SET_TILE_ROWS, 2); - } else if (cfg_.g_threads == 8) { - encoder->Control(AV1E_SET_TILE_COLUMNS, 4); - encoder->Control(AV1E_SET_TILE_ROWS, 2); - } else { - encoder->Control(AV1E_SET_TILE_COLUMNS, cfg_.g_threads >> 1); - } + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); encoder->Control(AV1E_SET_ROW_MT, 1); } if (screen_mode_) { @@ -986,7 +979,7 @@ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 60); - const int bitrate_array[2] = { 800, 1200 }; + const int bitrate_array[2] = { 1000, 1500 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); screen_mode_ = 1; @@ -997,9 +990,9 @@ target_layer_bitrate_[2] = cfg_.rc_target_bitrate; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { - ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50) + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.40) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.7) + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 2.0) << " The datarate for the file is greater than target by too much!"; } // Top temporal layers are non_reference, so exlcude them from @@ -1575,6 +1568,8 @@ const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 0; set_speed_per_layer_ = true; number_temporal_layers_ = 3; number_spatial_layers_ = 3; @@ -1618,6 +1613,8 @@ const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 0; 
number_temporal_layers_ = 3; number_spatial_layers_ = 3; // SL0 @@ -1644,6 +1641,37 @@ } } + virtual void BasicRateTargetingSVC2TL1SLHDMultiThread4Test() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.g_threads = 4; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + tile_columns_ = 1; + tile_rows_ = 1; + number_temporal_layers_ = 2; + number_spatial_layers_ = 1; + target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[1] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45) + << " The datarate for the file is greater than target by too much!"; + } + } + virtual void BasicRateTargetingSVC3TL3SLHDMultiThread4Test() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -1660,6 +1688,8 @@ const int bitrate_array[2] = { 600, 1200 }; cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; ResetModel(); + tile_columns_ = 1; + tile_rows_ = 1; number_temporal_layers_ = 3; number_spatial_layers_ = 3; // SL0 @@ -2504,8 +2534,15 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread2) { BasicRateTargetingSVC3TL3SLHDMultiThread2Test(); } + +// Check basic rate targeting for CBR, for 1 spatial, 2 temporal layers, +// for 4 threads, 2 tile_columns, 2 tiles_rows, row-mt enabled. 
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLHDMultiThread4) { + BasicRateTargetingSVC2TL1SLHDMultiThread4Test(); +} + // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, -// for 4 threads, 4 tile_columns, row-mt enabled. +// for 4 threads, 2 tile_columns, 2 tiles_rows, row-mt enabled. TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread4) { BasicRateTargetingSVC3TL3SLHDMultiThread4Test(); }
diff --git a/test/test.cmake b/test/test.cmake index 2ca7e64..02e85f8 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -28,8 +28,7 @@ set(AOM_TEST_SOURCE_VARS "${AOM_TEST_SOURCE_VARS}" PARENT_SCOPE) endfunction() -list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" - "${AOM_ROOT}/test/test_libaom.cc") +list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_ROOT}/test/test_libaom.cc") add_to_libaom_test_srcs(AOM_UNIT_TEST_WRAPPER_SOURCES) list(APPEND AOM_UNIT_TEST_COMMON_SOURCES @@ -102,7 +101,7 @@ list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc") list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h") add_to_libaom_test_srcs(AOM_UNIT_TEST_WEBM_SOURCES) -list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" +list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_ROOT}/test/test_intra_pred_speed.cc") if(CONFIG_AV1_DECODER) @@ -157,12 +156,6 @@ "${AOM_ROOT}/test/simd_cmp_impl.h" "${AOM_ROOT}/test/simd_impl.h") - if(HAVE_NEON) - list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON - "${AOM_ROOT}/test/simd_cmp_neon.cc") - add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_NEON) - endif() - if(HAVE_SSE2) list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2 "${AOM_ROOT}/test/simd_cmp_sse2.cc") @@ -216,6 +209,7 @@ "${AOM_ROOT}/test/fdct4x4_test.cc" "${AOM_ROOT}/test/fft_test.cc" "${AOM_ROOT}/test/firstpass_test.cc" + "${AOM_ROOT}/test/frame_resize_test.cc" "${AOM_ROOT}/test/fwht4x4_test.cc" "${AOM_ROOT}/test/hadamard_test.cc" "${AOM_ROOT}/test/horver_correlation_test.cc" @@ -283,29 +277,24 @@ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES "${AOM_ROOT}/test/coding_path_sync.cc") endif() - if(CONFIG_REALTIME_ONLY) - list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES - "${AOM_ROOT}/test/altref_test.cc" - "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" - "${AOM_ROOT}/test/av1_ext_tile_test.cc" - "${AOM_ROOT}/test/cnn_test.cc" - "${AOM_ROOT}/test/decode_multithreaded_test.cc" - "${AOM_ROOT}/test/error_resilience_test.cc" - "${AOM_ROOT}/test/kf_test.cc" - "${AOM_ROOT}/test/lossless_test.cc" - 
"${AOM_ROOT}/test/sb_multipass_test.cc" - "${AOM_ROOT}/test/sb_qp_sweep_test.cc" - "${AOM_ROOT}/test/selfguided_filter_test.cc" - "${AOM_ROOT}/test/screen_content_test.cc" - "${AOM_ROOT}/test/still_picture_test.cc" - "${AOM_ROOT}/test/tile_independence_test.cc" - "${AOM_ROOT}/test/tpl_model_test.cc") - endif() endif() - - if(HAVE_NEON) - list(APPEND AOM_UNIT_TEST_COMMON_SOURCES - "${AOM_ROOT}/test/simd_neon_test.cc") + if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/altref_test.cc" + "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" + "${AOM_ROOT}/test/av1_ext_tile_test.cc" + "${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/decode_multithreaded_test.cc" + "${AOM_ROOT}/test/error_resilience_test.cc" + "${AOM_ROOT}/test/kf_test.cc" + "${AOM_ROOT}/test/lossless_test.cc" + "${AOM_ROOT}/test/sb_multipass_test.cc" + "${AOM_ROOT}/test/sb_qp_sweep_test.cc" + "${AOM_ROOT}/test/selfguided_filter_test.cc" + "${AOM_ROOT}/test/screen_content_test.cc" + "${AOM_ROOT}/test/still_picture_test.cc" + "${AOM_ROOT}/test/tile_independence_test.cc" + "${AOM_ROOT}/test/tpl_model_test.cc") endif() if(CONFIG_FPMT_TEST AND (NOT CONFIG_REALTIME_ONLY)) @@ -473,6 +462,7 @@ add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES} $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_usage_exit> $<TARGET_OBJECTS:test_aom_common>) set_property(TARGET test_libaom PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) list(APPEND AOM_APP_TARGETS test_libaom) @@ -495,9 +485,9 @@ endif() if(NOT BUILD_SHARED_LIBS) - add_executable(test_intra_pred_speed - ${AOM_TEST_INTRA_PRED_SPEED_SOURCES} - $<TARGET_OBJECTS:aom_common_app_util>) + add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES} + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_usage_exit>) set_property(TARGET test_intra_pred_speed PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom @@ -645,11 +635,9 @@ 
if(CONFIG_AV1_ENCODER AND ENABLE_TESTS AND CONFIG_WEBM_IO - AND NOT BUILD_SHARED_LIBS AND NOT CONFIG_REALTIME_ONLY) add_executable(test_aom_rc ${AOM_RC_TEST_SOURCES}) - target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom aom_av1_rc - aom_gtest aom_gmock webm) + target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom_av1_rc aom_gtest) set_property(TARGET test_aom_rc PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER}) list(APPEND AOM_APP_TARGETS test_aom_rc) endif()
diff --git a/test/test_libaom.cc b/test/test_libaom.cc index fbd7f2e..26abbb0 100644 --- a/test/test_libaom.cc +++ b/test/test_libaom.cc
@@ -62,6 +62,7 @@ if (!(caps & HAS_NEON_DOTPROD)) append_negative_gtest_filter("NEON_DOTPROD"); if (!(caps & HAS_NEON_I8MM)) append_negative_gtest_filter("NEON_I8MM"); if (!(caps & HAS_SVE)) append_negative_gtest_filter("SVE"); + if (!(caps & HAS_SVE2)) append_negative_gtest_filter("SVE2"); #elif AOM_ARCH_ARM const int caps = aom_arm_cpu_caps(); if (!(caps & HAS_NEON)) append_negative_gtest_filter("NEON");
diff --git a/test/tools_common.sh b/test/tools_common.sh index f2d1802..cb9eba1 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh
@@ -312,7 +312,11 @@ # Combine environment and actual tests. local tests_to_run="${env_tests} ${tests_to_filter}" - check_version_strings + # av1_c_vs_simd_encode is a standalone test, and it doesn't need to check the + # version string. + if [ "${test_name}" != "av1_c_vs_simd_encode" ]; then + check_version_strings + fi # Run tests. for test in ${tests_to_run}; do @@ -464,6 +468,8 @@ AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no} +# This checking requires config/aom_config.c that is available in Jenkins +# testing. if [ "$(is_windows_target)" = "yes" ]; then AOM_TEST_EXE_SUFFIX=".exe" fi
diff --git a/test/variance_test.cc b/test/variance_test.cc index adca1b1..261c080 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc
@@ -2147,6 +2147,27 @@ MseParams(3, 3, &aom_highbd_8_mse8x8_neon_dotprod, 8))); #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, MseHBDWxHTest, + ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sve, + 10))); + +INSTANTIATE_TEST_SUITE_P( + SVE, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sve, 12), + MseParams(4, 3, &aom_highbd_12_mse16x8_sve, 12), + MseParams(3, 4, &aom_highbd_12_mse8x16_sve, 12), + MseParams(3, 3, &aom_highbd_12_mse8x8_sve, 12), + MseParams(4, 4, &aom_highbd_10_mse16x16_sve, 10), + MseParams(4, 3, &aom_highbd_10_mse16x8_sve, 10), + MseParams(3, 4, &aom_highbd_10_mse8x16_sve, 10), + MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10))); +#endif // HAVE_SVE + const VarianceParams kArrayHBDVariance_c[] = { VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12), VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12), @@ -2764,64 +2785,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum16x16DualTest, ::testing::ValuesIn(kArrayGetSseSum16x16Dual_sse2)); -const SubpelVarianceParams kArraySubpelVariance_sse2[] = { - SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0), - SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0), - SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0), - SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0), - SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0), - SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0), - SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0), - SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0), - SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0), - SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0), - 
SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0), - SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0), - SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0), - SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0), - SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0), - SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0), -#if !CONFIG_REALTIME_ONLY - SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0), - SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0), - SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0), - SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0), - SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0), - SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0), -#endif -}; -INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelVarianceTest, - ::testing::ValuesIn(kArraySubpelVariance_sse2)); - -const SubpelAvgVarianceParams kArraySubpelAvgVariance_sse2[] = { - SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, 0), - SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, 0), - SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, 0), - SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0), - SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0), - SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0), - SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0), - SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0), - SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0), - SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0), - SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0), - SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0), - SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0), - 
SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0), - SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0), - SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0), -#if !CONFIG_REALTIME_ONLY - SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0), - SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0), - SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0), - SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0), - SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0), - SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, 0), -#endif -}; -INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelAvgVarianceTest, - ::testing::ValuesIn(kArraySubpelAvgVariance_sse2)); - #if CONFIG_AV1_HIGHBITDEPTH #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( @@ -2831,6 +2794,15 @@ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10), MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2, 10))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2, 12), + MseParams(3, 3, &aom_highbd_12_mse8x8_sse2, 12), + MseParams(4, 4, &aom_highbd_10_mse16x16_sse2, 10), + MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10), + MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8), + MseParams(3, 3, &aom_highbd_8_mse8x8_sse2, 8))); #endif // HAVE_SSE2 #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( @@ -2857,14 +2829,11 @@ 12))); #endif // HAVE_SSE4_1 +#if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( - SSE2, AvxHBDMseTest, - ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2, 12), - MseParams(3, 3, &aom_highbd_12_mse8x8_sse2, 12), - MseParams(4, 4, &aom_highbd_10_mse16x16_sse2, 10), - MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10), - MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8), - MseParams(3, 3, &aom_highbd_8_mse8x8_sse2, 8))); + AVX2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, 
&aom_highbd_10_mse16x16_avx2, 10))); +#endif // HAVE_AVX2 const VarianceParams kArrayHBDVariance_sse2[] = { VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12), @@ -4262,4 +4231,84 @@ #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE + +#if CONFIG_AV1_HIGHBITDEPTH +const VarianceParams kArrayHBDVariance_sve[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_sve, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_sve, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_sve, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_sve, 12), + VarianceParams(6, 5, &aom_highbd_12_variance64x32_sve, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_sve, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_sve, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_sve, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_sve, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_sve, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_sve, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_sve, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_sve, 12), + VarianceParams(3, 2, &aom_highbd_12_variance8x4_sve, 12), + VarianceParams(2, 3, &aom_highbd_12_variance4x8_sve, 12), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_sve, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_sve, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_sve, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_sve, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_sve, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_sve, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_sve, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_sve, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_sve, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_sve, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_sve, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_sve, 10), + 
VarianceParams(3, 4, &aom_highbd_10_variance8x16_sve, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_sve, 10), + VarianceParams(3, 2, &aom_highbd_10_variance8x4_sve, 10), + VarianceParams(2, 3, &aom_highbd_10_variance4x8_sve, 10), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_sve, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_sve, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_sve, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_sve, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_sve, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_sve, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_sve, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_sve, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_sve, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_sve, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_sve, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_sve, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_sve, 8), + VarianceParams(3, 3, &aom_highbd_8_variance8x8_sve, 8), + VarianceParams(3, 2, &aom_highbd_8_variance8x4_sve, 8), + VarianceParams(2, 3, &aom_highbd_8_variance4x8_sve, 8), + VarianceParams(2, 2, &aom_highbd_8_variance4x4_sve, 8), +#if !CONFIG_REALTIME_ONLY + VarianceParams(6, 4, &aom_highbd_12_variance64x16_sve, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_sve, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_sve, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_sve, 12), + VarianceParams(4, 2, &aom_highbd_12_variance16x4_sve, 12), + VarianceParams(2, 4, &aom_highbd_12_variance4x16_sve, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_sve, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_sve, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_sve, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_sve, 10), + VarianceParams(4, 2, &aom_highbd_10_variance16x4_sve, 10), + VarianceParams(2, 4, 
&aom_highbd_10_variance4x16_sve, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_sve, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_sve, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_sve, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_sve, 8), + VarianceParams(4, 2, &aom_highbd_8_variance16x4_sve, 8), + VarianceParams(2, 4, &aom_highbd_8_variance4x16_sve, 8), +#endif +}; + +INSTANTIATE_TEST_SUITE_P(SVE, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_sve)); + +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SVE + } // namespace
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index f0be7d2..8844ba7 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc
@@ -88,6 +88,12 @@ INSTANTIATE_TEST_SUITE_P( SVE, AV1WarpFilterTest, libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve)); + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, AV1HighbdWarpFilterTest, + libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_sve)); +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_SVE } // namespace
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc index 470c980..b7c60c2 100644 --- a/test/warp_filter_test_util.cc +++ b/test/warp_filter_test_util.cc
@@ -18,6 +18,7 @@ using std::tuple; namespace libaom_test { +namespace { int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits, int rnd_gen_zeros) { @@ -114,6 +115,8 @@ } } +} // namespace + namespace AV1WarpFilter { ::testing::internal::ParamGenerator<WarpTestParams> BuildParams( warp_affine_func filter) {
diff --git a/test/wiener_test.cc b/test/wiener_test.cc index b995c84..77d2769 100644 --- a/test/wiener_test.cc +++ b/test/wiener_test.cc
@@ -158,11 +158,11 @@ } } -void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, - int16_t *d, int16_t *s, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, int src_stride, - int64_t *M, int64_t *H, - int use_downsampled_wiener_stats) { +static void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, int use_downsampled_wiener_stats) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_win_opt_c(wiener_win, dgd, src, d, s, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, @@ -397,6 +397,12 @@ ::testing::Values(av1_compute_stats_neon)); #endif // HAVE_NEON +#if HAVE_SVE + +INSTANTIATE_TEST_SUITE_P(SVE, WienerTest, + ::testing::Values(av1_compute_stats_sve)); +#endif // HAVE_SVE + } // namespace wiener_lowbd #if CONFIG_AV1_HIGHBITDEPTH @@ -513,26 +519,29 @@ } } -void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { +static void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *d, + int16_t *s, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd, src, d, s, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + bit_depth); } } static const int 
kIterations = 100; typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, + int64_t *M, int64_t *H, aom_bit_depth_t bit_depth); typedef std::tuple<const compute_stats_Func> WienerTestParam; @@ -546,11 +555,17 @@ dgd_buf = (uint16_t *)aom_memalign( 32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf)); ASSERT_NE(dgd_buf, nullptr); + const size_t buf_size = + sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + buf = (int16_t *)aom_memalign(32, buf_size); + ASSERT_NE(buf, nullptr); + memset(buf, 0, buf_size); target_func_ = GET_PARAM(0); } void TearDown() override { aom_free(src_buf); aom_free(dgd_buf); + aom_free(buf); } void RunWienerTest(const int32_t wiener_win, int32_t run_times, aom_bit_depth_t bit_depth); @@ -562,6 +577,7 @@ libaom_test::ACMRandom rng_; uint16_t *src_buf; uint16_t *dgd_buf; + int16_t *buf; }; void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, @@ -589,6 +605,9 @@ const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = run_times == 1 ? 
kIterations : 2; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { dgd_buf[i] = rng_.Rand16() % (1 << bit_depth); @@ -601,16 +620,17 @@ aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M_ref, - H_ref, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); } aom_usec_timer_mark(&timer); const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); } aom_usec_timer_mark(&timer); const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); @@ -657,6 +677,9 @@ const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = 1; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { // Fill with alternating extreme values to maximize difference with // the average. 
@@ -668,12 +691,13 @@ dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin); const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf); - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M_ref, H_ref, - bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); int failed = 0; for (int i = 0; i < wiener_win2; ++i) { @@ -743,6 +767,11 @@ ::testing::Values(av1_compute_stats_highbd_neon)); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_sve)); +#endif // HAVE_SVE + // A test that reproduces b/274668506: signed integer overflow in // update_a_sep_sym(). TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateASepSym) { @@ -1691,6 +1720,97 @@ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); } +// A test that reproduces crbug.com/oss-fuzz/68195: signed integer overflow in +// linsolve_wiener(). 
+TEST(SearchWienerTest, 8bitSignedIntegerOverflowInLinsolveWiener) { + constexpr int kWidth = 4; + constexpr int kHeight = 3; + constexpr unsigned char kBuffer[kWidth * kHeight] = { + // Y plane: + 50, 167, 190, 194, 27, 29, 204, 182, 133, 239, 64, 179, + }; + unsigned char *img_data = const_cast<unsigned char *>(kBuffer); + + aom_image_t img; + EXPECT_EQ(&img, + aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, img_data)); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.monochrome = 1; + img.csp = AOM_CSP_UNKNOWN; + img.range = AOM_CR_FULL_RANGE; + img.planes[1] = img.planes[2] = nullptr; + img.stride[1] = img.stride[2] = 0; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.rc_end_usage = AOM_Q; + cfg.g_profile = 0; + cfg.g_bit_depth = AOM_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_w = kWidth; + cfg.g_h = kHeight; + cfg.g_threads = 32; + cfg.monochrome = 1; + cfg.rc_min_quantizer = 50; + cfg.rc_max_quantizer = 57; + aom_codec_ctx_t enc; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 53)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 1)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 1)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM)); + + // Encode frame + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0)); + aom_codec_iter_t iter = nullptr; + const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_EQ(pkt, nullptr); + + // Encode frame + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, 
&img, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x1f0011. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x0. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + // A test that reproduces b/259173819: signed integer overflow in // linsolve_wiener(). TEST(SearchWienerTest, 10bitSignedIntegerOverflowInLinsolveWiener) { @@ -1768,5 +1888,151 @@ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); } +// A test that reproduces b/330639949: signed integer overflow in +// linsolve_wiener(). +TEST(SearchWienerTest, 12bitSignedIntegerOverflowInLinsolveWiener) { + constexpr int kWidth = 173; + constexpr int kHeight = 3; + // Since the image format is YUV 4:2:0, aom_img_wrap() expects the buffer is + // allocated with width and height aligned to a multiple of 2. Align the + // width to a multiple of 2 so that the stride set by aom_img_wrap() is + // correct. It is not necessary to align the height to a multiple of 2 + // because aom_codec_encode() will only read cfg.g_h rows. 
+ static constexpr uint16_t kBuffer[(kWidth + 1) * kHeight] = { + // Y plane: + // Row: + 0, 0, 369, 0, 4095, 873, 4095, 4095, 0, 571, 4023, 0, 1028, 58, 556, 0, 0, + 1875, 16, 1043, 4095, 0, 1671, 1990, 0, 4095, 2932, 3117, 4095, 0, 0, 0, + 4095, 4095, 4095, 4095, 4095, 4095, 508, 4095, 0, 0, 4095, 4095, 4095, 0, + 4095, 4095, 0, 197, 4095, 1475, 1127, 4095, 0, 1570, 1881, 4095, 1215, 4095, + 0, 0, 1918, 4095, 0, 4095, 3415, 0, 732, 122, 1087, 0, 0, 0, 0, 0, 1012, + 4095, 0, 4095, 4095, 0, 0, 4095, 1931, 4095, 0, 4095, 4095, 4095, 4095, 570, + 4095, 4095, 0, 2954, 0, 0, 0, 1925, 3802, 0, 4095, 55, 0, 4095, 760, 4095, + 0, 3313, 4095, 4095, 4095, 0, 218, 799, 4095, 0, 4095, 2455, 4095, 0, 0, + 611, 4095, 3060, 1669, 0, 0, 4095, 3589, 3903, 0, 3427, 1903, 0, 4095, 3789, + 4095, 4095, 107, 2064, 4095, 2764, 4095, 0, 0, 0, 3498, 0, 0, 1336, 4095, + 4095, 3480, 0, 545, 673, 4095, 0, 4095, 4095, 3175, 4095, 1623, 4095, 0, + 540, 4095, 4095, 14, 429, 0, 0, + // Row: + 0, 4095, 4095, 0, 1703, 3003, 968, 1313, 4095, 613, 4095, 3918, 112, 4095, + 0, 4095, 2211, 88, 4051, 1203, 2005, 4095, 4095, 0, 2106, 0, 4095, 0, 4095, + 4095, 4095, 0, 3261, 0, 4095, 0, 1184, 4095, 4095, 818, 4095, 0, 4095, 1292, + 4095, 0, 4095, 4095, 0, 4095, 4095, 0, 0, 346, 906, 974, 4095, 4095, 4095, + 4095, 0, 4095, 3225, 2547, 4095, 0, 0, 2705, 2933, 4095, 0, 0, 3579, 0, + 4095, 4095, 4095, 1872, 4095, 298, 2961, 0, 0, 2805, 0, 0, 1210, 3773, 0, + 1208, 3347, 0, 4095, 0, 0, 0, 4034, 4095, 0, 0, 4095, 0, 0, 0, 3302, 0, 0, + 0, 0, 0, 4095, 4095, 0, 2609, 4095, 0, 1831, 4095, 0, 2463, 4095, 4095, + 4095, 4095, 752, 4095, 4095, 41, 1829, 2975, 227, 2505, 2719, 1059, 4071, + 4095, 4095, 3859, 0, 0, 0, 0, 4095, 2423, 4095, 4095, 4095, 4095, 4095, + 1466, 0, 0, 4095, 121, 0, 0, 4095, 0, 0, 3328, 4095, 4095, 0, 1172, 0, 2938, + 0, 4095, 0, 0, 0, 4095, 1821, 0, + // Row: + 4095, 4095, 4095, 4095, 3487, 4095, 0, 0, 0, 3367, 4095, 4095, 1139, 4095, + 4095, 169, 1300, 1840, 4095, 3508, 4095, 618, 4095, 4095, 
4095, 53, 4095, + 4095, 4095, 4095, 4055, 0, 0, 0, 4095, 4095, 0, 0, 0, 0, 1919, 2415, 1485, + 458, 4095, 4095, 3176, 4095, 0, 0, 4095, 4095, 617, 3631, 4095, 4095, 0, 0, + 3983, 4095, 4095, 681, 1685, 4095, 4095, 0, 1783, 25, 4095, 0, 0, 4095, + 4095, 0, 2075, 0, 4095, 4095, 4095, 0, 773, 3407, 0, 4095, 4095, 0, 0, 4095, + 4095, 4095, 4095, 4095, 0, 0, 0, 0, 4095, 0, 1804, 0, 0, 3169, 3576, 502, 0, + 0, 4095, 0, 4095, 0, 4095, 4095, 4095, 0, 4095, 779, 0, 4095, 0, 0, 0, 4095, + 0, 0, 4095, 4095, 4095, 4095, 0, 0, 4095, 4095, 2134, 4095, 4020, 2990, + 3949, 4095, 4095, 4095, 4095, 4095, 0, 4095, 4095, 2829, 4095, 4095, 4095, + 0, 197, 2328, 3745, 0, 3412, 190, 4095, 4095, 4095, 2809, 3953, 0, 4095, + 1502, 2514, 3866, 0, 0, 4095, 4095, 1878, 129, 4095, 0 + }; + unsigned char *img_data = + reinterpret_cast<unsigned char *>(const_cast<uint16_t *>(kBuffer)); + + aom_image_t img; + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I42016, kWidth, kHeight, 1, + img_data)); + img.cp = AOM_CICP_CP_UNSPECIFIED; + img.tc = AOM_CICP_TC_UNSPECIFIED; + img.mc = AOM_CICP_MC_UNSPECIFIED; + img.monochrome = 1; + img.csp = AOM_CSP_UNKNOWN; + img.range = AOM_CR_FULL_RANGE; + img.planes[1] = img.planes[2] = nullptr; + img.stride[1] = img.stride[2] = 0; + + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY)); + cfg.rc_end_usage = AOM_Q; + cfg.g_profile = 2; + cfg.g_bit_depth = AOM_BITS_12; + cfg.g_input_bit_depth = 12; + cfg.g_w = kWidth; + cfg.g_h = kHeight; + cfg.g_lag_in_frames = 0; + cfg.g_threads = 18; + cfg.monochrome = 1; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 51; + aom_codec_ctx_t enc; + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 25)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 4)); + 
EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE)); + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM)); + + // Encode frame + EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK); + aom_codec_iter_t iter = nullptr; + const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x1f0011. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Encode frame + EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x20000. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Encode frame + EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x20000. + EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Encode frame + EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + ASSERT_NE(pkt, nullptr); + EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + // pkt->data.frame.flags is 0x20000. 
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + // Flush encoder + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0)); + iter = nullptr; + pkt = aom_codec_get_cx_data(&enc, &iter); + EXPECT_EQ(pkt, nullptr); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + } // namespace wiener_highbd #endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/libwebm/README.libaom b/third_party/libwebm/README.libaom index ee350a5..1eb0ce9 100644 --- a/third_party/libwebm/README.libaom +++ b/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 1930e3ca23b007f3ff11d98a570077be6201957e +Version: affd7f4d9644aa2b65981fa6c7616400be760e6e License: BSD License File: LICENSE.TXT
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index faaf016..21e51be 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -65,7 +65,8 @@ if (dst == NULL) return false; - strcpy(dst, src); // NOLINT + memcpy(dst, src, size - 1); + dst[size - 1] = '\0'; return true; } @@ -919,11 +920,8 @@ const size_t length = strlen(codec_id) + 1; codec_id_ = new (std::nothrow) char[length]; // NOLINT if (codec_id_) { -#ifdef _MSC_VER - strcpy_s(codec_id_, length, codec_id); -#else - strcpy(codec_id_, codec_id); -#endif + memcpy(codec_id_, codec_id, length - 1); + codec_id_[length - 1] = '\0'; } } } @@ -936,11 +934,8 @@ const size_t length = strlen(language) + 1; language_ = new (std::nothrow) char[length]; // NOLINT if (language_) { -#ifdef _MSC_VER - strcpy_s(language_, length, language); -#else - strcpy(language_, language); -#endif + memcpy(language_, language, length - 1); + language_[length - 1] = '\0'; } } } @@ -952,11 +947,8 @@ const size_t length = strlen(name) + 1; name_ = new (std::nothrow) char[length]; // NOLINT if (name_) { -#ifdef _MSC_VER - strcpy_s(name_, length, name); -#else - strcpy(name_, name); -#endif + memcpy(name_, name, length - 1); + name_[length - 1] = '\0'; } } } @@ -1559,11 +1551,8 @@ const size_t length = strlen(colour_space) + 1; colour_space_ = new (std::nothrow) char[length]; // NOLINT if (colour_space_) { -#ifdef _MSC_VER - strcpy_s(colour_space_, length, colour_space); -#else - strcpy(colour_space_, colour_space); -#endif + memcpy(colour_space_, colour_space, length - 1); + colour_space_[length - 1] = '\0'; } } } @@ -2856,13 +2845,13 @@ uint32_t SeekHead::GetId(int index) const { if (index < 0 || index >= kSeekEntryCount) - return UINT_MAX; + return UINT32_MAX; return seek_entry_id_[index]; } uint64_t SeekHead::GetPosition(int index) const { if (index < 0 || index >= kSeekEntryCount) - return ULLONG_MAX; + return UINT64_MAX; return seek_entry_pos_[index]; } @@ -2896,7 +2885,7 @@ muxing_app_(NULL), timecode_scale_(1000000ULL), writing_app_(NULL), - date_utc_(LLONG_MIN), + date_utc_(INT64_MIN), duration_pos_(-1) {} SegmentInfo::~SegmentInfo() { @@ -2927,11 +2916,8 
@@ if (!muxing_app_) return false; -#ifdef _MSC_VER - strcpy_s(muxing_app_, app_len, temp); -#else - strcpy(muxing_app_, temp); -#endif + memcpy(muxing_app_, temp, app_len - 1); + muxing_app_[app_len - 1] = '\0'; set_writing_app(temp); if (!writing_app_) @@ -2974,7 +2960,7 @@ if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_)); - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) size += EbmlDateElementSize(libwebm::kMkvDateUTC); size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_); size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_); @@ -2999,7 +2985,7 @@ return false; } - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_); if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_)) @@ -3022,11 +3008,8 @@ if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + temp_str[length - 1] = '\0'; delete[] muxing_app_; muxing_app_ = temp_str; @@ -3040,11 +3023,8 @@ if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + temp_str[length - 1] = '\0'; delete[] writing_app_; writing_app_ = temp_str; @@ -3628,19 +3608,17 @@ if (chunking_ && !strcmp(filename, chunking_base_name_)) return true; - const size_t name_length = strlen(filename) + 1; - char* const temp = new (std::nothrow) char[name_length]; // NOLINT + const size_t filename_length = strlen(filename); + char* const temp = new (std::nothrow) char[filename_length + 1]; // NOLINT if (!temp) return false; -#ifdef _MSC_VER - strcpy_s(temp, name_length, filename); -#else - strcpy(temp, filename); -#endif + memcpy(temp, filename, filename_length); + temp[filename_length] = '\0'; delete[] chunking_base_name_; chunking_base_name_ = temp; + // From this point, strlen(chunking_base_name_) 
== filename_length if (!UpdateChunkName("chk", &chunk_name_)) return false; @@ -3666,18 +3644,16 @@ if (!chunk_writer_cluster_->Open(chunk_name_)) return false; - const size_t header_length = strlen(filename) + strlen(".hdr") + 1; + const size_t hdr_length = strlen(".hdr"); + const size_t header_length = filename_length + hdr_length + 1; char* const header = new (std::nothrow) char[header_length]; // NOLINT if (!header) return false; -#ifdef _MSC_VER - strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_); - strcat_s(header, header_length, ".hdr"); -#else - strcpy(header, chunking_base_name_); - strcat(header, ".hdr"); -#endif + memcpy(header, chunking_base_name_, filename_length); + memcpy(&header[filename_length], ".hdr", hdr_length); + header[filename_length + hdr_length] = '\0'; + if (!chunk_writer_header_->Open(header)) { delete[] header; return false; @@ -4022,18 +3998,16 @@ snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); #endif - const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1; + const size_t chunking_base_name_length = strlen(chunking_base_name_); + const size_t ext_chk_length = strlen(ext_chk); + const size_t length = chunking_base_name_length + ext_chk_length + 1; char* const str = new (std::nothrow) char[length]; // NOLINT if (!str) return false; -#ifdef _MSC_VER - strcpy_s(str, length - strlen(ext_chk), chunking_base_name_); - strcat_s(str, length, ext_chk); -#else - strcpy(str, chunking_base_name_); - strcat(str, ext_chk); -#endif + memcpy(str, chunking_base_name_, chunking_base_name_length); + memcpy(&str[chunking_base_name_length], ext_chk, ext_chk_length); + str[chunking_base_name_length + ext_chk_length] = '\0'; delete[] * name; *name = str;
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h index 8602d82..2c4bb9e 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -1481,7 +1481,7 @@ uint64_t timecode_scale_; // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision. char* writing_app_; - // LLONG_MIN when DateUTC is not set. + // INT64_MIN when DateUTC is not set. int64_t date_utc_; // The file position of the duration element.
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 300b155..f538310 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -607,22 +607,18 @@ void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 3; - *build = 1; + *build = 3; *revision = 0; } uint64 MakeUID(unsigned int* seed) { uint64 uid = 0; -#ifdef __MINGW32__ - srand(*seed); -#endif - for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values uid <<= 8; // TODO(fgalligan): Move random number generation to platform specific code. -#ifdef _MSC_VER +#ifdef _WIN32 (void)seed; const int32 nn = rand(); #elif __ANDROID__ @@ -634,8 +630,6 @@ close(fd); } const int32 nn = temp_num; -#elif defined __MINGW32__ - const int32 nn = rand(); #else const int32 nn = rand_r(seed); #endif
diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index 868afcb..eddbc7e 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc
@@ -55,7 +55,7 @@ void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; minor = 1; - build = 1; + build = 3; revision = 0; } @@ -246,7 +246,8 @@ if (size == 4) { union { float f; - unsigned long ff; + uint32_t ff; + static_assert(sizeof(float) == sizeof(uint32_t), ""); }; ff = 0; @@ -264,7 +265,8 @@ } else { union { double d; - unsigned long long dd; + uint64_t dd; + static_assert(sizeof(double) == sizeof(uint64_t), ""); }; dd = 0; @@ -4569,7 +4571,8 @@ if (dst == NULL) return -1; - strcpy(dst, src); + memcpy(dst, src, len); + dst[len] = '\0'; return 0; }
diff --git a/tools/auto_refactor/c_files/decl_status_code.c b/tools/auto_refactor/c_files/decl_status_code.c index bd445ab..a444553 100644 --- a/tools/auto_refactor/c_files/decl_status_code.c +++ b/tools/auto_refactor/c_files/decl_status_code.c
@@ -13,17 +13,17 @@ int x; } T1; -int parse_decl_node_2() { int arr[3]; } +int parse_decl_node_2(void) { int arr[3]; } -int parse_decl_node_3() { int *a; } +int parse_decl_node_3(void) { int *a; } -int parse_decl_node_4() { T1 t1[3]; } +int parse_decl_node_4(void) { T1 t1[3]; } -int parse_decl_node_5() { T1 *t2[3]; } +int parse_decl_node_5(void) { T1 *t2[3]; } -int parse_decl_node_6() { T1 t3[3][3]; } +int parse_decl_node_6(void) { T1 t3[3][3]; } -int main() { +int main(void) { int a; T1 t1; struct S1 s1;
diff --git a/tools/auto_refactor/c_files/func_in_out.c b/tools/auto_refactor/c_files/func_in_out.c index 67ab58d..7f37bba 100644 --- a/tools/auto_refactor/c_files/func_in_out.c +++ b/tools/auto_refactor/c_files/func_in_out.c
@@ -199,7 +199,7 @@ for (int i = 0; i < 10; ++i) cpi->y--; } -int main() { +int main(void) { int x; VP9_COMP cpi; RD rd;
diff --git a/tools/auto_refactor/c_files/parse_lvalue.c b/tools/auto_refactor/c_files/parse_lvalue.c index 97113ef..fa44d72 100644 --- a/tools/auto_refactor/c_files/parse_lvalue.c +++ b/tools/auto_refactor/c_files/parse_lvalue.c
@@ -39,7 +39,7 @@ return 0; } -int main() { +int main(void) { int x = 0; VP9_COMP cpi; func(&cpi, x);
diff --git a/tools/auto_refactor/c_files/simple_code.c b/tools/auto_refactor/c_files/simple_code.c index dd89a15..902cd1d 100644 --- a/tools/auto_refactor/c_files/simple_code.c +++ b/tools/auto_refactor/c_files/simple_code.c
@@ -48,11 +48,11 @@ c(1); return 0; } -int e() { +int e(void) { c(0); return 0; } -int main() { +int main(void) { int p = 3; S s; s.x = p + 1;
diff --git a/tools/auto_refactor/c_files/struct_code.c b/tools/auto_refactor/c_files/struct_code.c index e14372c..7f24d41 100644 --- a/tools/auto_refactor/c_files/struct_code.c +++ b/tools/auto_refactor/c_files/struct_code.c
@@ -46,4 +46,4 @@ } z; } T7; -int main() {} +int main(void) {}
diff --git a/tools/obu_parser.cc b/tools/obu_parser.cc index 5716b46..4053615 100644 --- a/tools/obu_parser.cc +++ b/tools/obu_parser.cc
@@ -20,6 +20,7 @@ #include "tools/obu_parser.h" namespace aom_tools { +namespace { // Basic OBU syntax // 8 bits: Header @@ -116,6 +117,8 @@ } } +} // namespace + bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) { const int kObuHeaderSizeBytes = 1; const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;