Merge tag 'v3.7.1' into HEAD
libaom 3.7.1
2023-11-17 v3.7.1
This release includes several bug fixes and is ABI compatible with the
previous release (v3.7.0). See
https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the
commits in this release.
- Bug Fixes
* aomedia:3349: heap overflow when increasing resolution
* aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on
aom/av1/encoder/motion_search_facade.c
* aomedia:3489: Detect encoder and image high bit depth mismatch
* aomedia:3491: heap-buffer-overflow on frame size change
* b/303023614: Segfault at encoding time for high bit depth images
Bug: aomedia:3513
Change-Id: Iecf1f155b4f0ea2604ef27fef0d6111499ea9bad
diff --git a/.mailmap b/.mailmap
index 7d31a70..6d6e6302 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,12 +1,16 @@
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
-Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Alexander Bokov <alexanderbokov@google.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Andrey Norkin <anorkin@netflix.com>
Angie Chiang <angiebird@google.com>
Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
Aasaipriya Chandran <aasaipriya.c@ittiam.com>
Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
Apurve Pandey <apurve.pandey@ittiam.com>
@@ -27,9 +31,10 @@
Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
Guillaume Martres <smarter@ubuntu.com>
Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
-Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
Hui Su <huisu@google.com>
Iole Moccagatta <iole.moccagatta@gmail.com>
Jacky Chen <jackychen@google.com>
@@ -40,13 +45,14 @@
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Kyle Siefring <siekyleb@amazon.com>
Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
Lin Zheng <linzhen@google.com>
-Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
Luc Trudeau <luc@trud.ca>
Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
Marco Paniconi <marpan@google.com>
@@ -56,6 +62,7 @@
Mingliang Chen <mlchen@google.com>
Monty Montgomery <cmontgomery@mozilla.com>
Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
Mudassir Galaganath <mudassir.galaganath@ittiam.com> Mudassir Galagnath
Nathan E. Egge <negge@mozilla.com>
Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
@@ -72,13 +79,14 @@
Remya Prakasan <remya.prakasan@ittiam.com>
Roger Zhou <youzhou@microsoft.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
-Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
Sai Deng <sdeng@google.com>
Sami Pietilä <samipietila@google.com>
Sarah Parker <sarahparker@google.com>
+Susanna D'Souza <susannad@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
@@ -90,14 +98,16 @@
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com>
Wei-Ting Lin <weitinglin@google.com>
Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
Wenyao Liu <wenyao.liu@cidana.com>
Will Bresnahan <bill.wresnahan@gmail.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
-Yaowu Xu <yaowu@google.com> <Yaowu Xu>
-Yaowu Xu <yaowu@google.com> <yaowu.google.com>
Zhipin Deng <zhipin.deng@intel.com>
Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e0b65f..8e6ca6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -288,9 +288,9 @@
add_library(aom_static STATIC ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
if(MSVC OR (WIN32 AND NOT MINGW))
- # Fix race condition on the export library file between the two versions.
- # Affects MSVC in all three flavors (stock, Clang/CL, LLVM-- the latter sets
- # MSVC and MINGW both to FALSE).
+ # Fix race condition between the import library and the static library.
+ # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter
+ # sets MSVC and MINGW both to FALSE).
set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll")
endif()
@@ -323,7 +323,7 @@
endif()
endif()
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
"${AOM_ROOT}/av1/ratectrl_rtc.cc")
add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
@@ -336,7 +336,7 @@
# List of object and static library targets.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc)
endif()
if(BUILD_SHARED_LIBS)
@@ -387,13 +387,6 @@
endif()
endif()
-if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
- add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
- $<TARGET_OBJECTS:aom_common_app_util>)
- set_property(TARGET ${example} PROPERTY FOLDER examples)
- list(APPEND AOM_APP_TARGETS resize_util)
-endif()
-
if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
$<TARGET_OBJECTS:aom_common_app_util>
@@ -494,14 +487,18 @@
$<TARGET_OBJECTS:aom_common_app_util>
$<TARGET_OBJECTS:aom_encoder_app_util>)
- add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
- $<TARGET_OBJECTS:aom_common_app_util>
- $<TARGET_OBJECTS:aom_encoder_app_util>)
-
# Maintain a list of encoder example targets.
list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
photon_noise_table set_maps simple_encoder scalable_encoder
- twopass_encoder svc_encoder_rtc)
+ twopass_encoder)
+
+ if(NOT BUILD_SHARED_LIBS)
+ add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc)
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc)
+ endif()
endif()
if(ENABLE_TOOLS)
@@ -852,7 +849,7 @@
# Aomedia documentation rule.
set(DOXYGEN_VERSION_VALUE 0)
if(ENABLE_DOCS)
- include(FindDoxygen)
+ find_package(Doxygen)
if(DOXYGEN_FOUND)
# Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
set(MINIMUM_DOXYGEN_VERSION 1008010)
@@ -942,7 +939,8 @@
get_cmake_property(all_cmake_vars VARIABLES)
foreach(var ${all_cmake_vars})
if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
- AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST")
+ AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST"
+ AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_")
list(APPEND aom_source_vars ${var})
endif()
endforeach()
diff --git a/README.md b/README.md
index d7b66e0..4e2eb27 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,7 @@
The toolchain files available at the time of this writing are:
- arm64-ios.cmake
+ - arm64-linux-clang.cmake
- arm64-linux-gcc.cmake
- arm64-mingw-gcc.cmake
- armv7-ios.cmake
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index e3d8d29..5d0bbe1 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -1006,11 +1006,11 @@
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
-#define AOM_USAGE_GOOD_QUALITY (0)
+#define AOM_USAGE_GOOD_QUALITY 0u
/*!\brief usage parameter analogous to AV1 REALTIME mode. */
-#define AOM_USAGE_REALTIME (1)
+#define AOM_USAGE_REALTIME 1u
/*!\brief usage parameter analogous to AV1 all intra mode. */
-#define AOM_USAGE_ALL_INTRA (2)
+#define AOM_USAGE_ALL_INTRA 2u
/*!\brief Encode a frame
*
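The usage macros above are passed as the third argument to
aom_codec_enc_config_default(). A minimal sketch of selecting a usage mode at
configuration time (standard libaom encode setup; the resolution and bitrate
values are illustrative and error handling is trimmed):

  #include <aom/aom_encoder.h>
  #include <aom/aomcx.h>

  static int init_realtime_encoder(aom_codec_ctx_t *codec, int width, int height) {
    aom_codec_iface_t *iface = aom_codec_av1_cx();
    aom_codec_enc_cfg_t cfg;
    // Fill cfg with the defaults for the chosen usage mode.
    if (aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME) != AOM_CODEC_OK)
      return -1;
    cfg.g_w = width;
    cfg.g_h = height;
    cfg.rc_end_usage = AOM_CBR;    // 1-pass CBR is typical for realtime usage
    cfg.rc_target_bitrate = 500;   // kbps
    return aom_codec_enc_init(codec, iface, &cfg, 0) == AOM_CODEC_OK ? 0 : -1;
  }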
diff --git a/aom/aomcx.h b/aom/aomcx.h
index a5db0a5..f061be3 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -208,14 +208,14 @@
* encoding process, values greater than 0 will increase encoder speed at
* the expense of quality.
*
- * Valid range: 0..10. 0 runs the slowest, and 10 runs the fastest;
+ * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest;
* quality improves as speed decreases (since more compression
* possibilities are explored).
*
- * NOTE: 10 is only allowed in AOM_USAGE_REALTIME. In AOM_USAGE_GOOD_QUALITY
- * and AOM_USAGE_ALL_INTRA, 9 is the highest allowed value. However,
- * AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, AOM_USAGE_REALTIME
- * treats 0..4 the same as 5.
+ * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In
+ * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed
+ * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also,
+ * AOM_USAGE_REALTIME treats 0..4 the same as 5.
*/
AOME_SET_CPUUSED = 13,
@@ -1527,6 +1527,12 @@
*/
AV1E_SET_BITRATE_ONE_PASS_CBR = 163,
+ /*!\brief Codec control to set the maximum number of consecutive frame drops
+ * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of
+ * zero has no effect.
+ */
+ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164,
+
// Any new encoder control IDs should be added above.
// Maximum allowed encoder control ID is 229.
// No encoder control ID should be added below.
@@ -1678,10 +1684,10 @@
/*!brief Parameters for setting ref frame config */
typedef struct aom_svc_ref_frame_config {
- // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
- // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ // 7 references: The index 0 - 6 refers to the references:
+ // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
int reference[7]; /**< Reference flag for each of the 7 references. */
- /*! Buffer slot index for each of 7 references. */
+ /*! Buffer slot index for each of 7 references indexed above. */
int ref_idx[7];
int refresh[8]; /**< Refresh flag for each of the 8 slots. */
} aom_svc_ref_frame_config_t;
@@ -2172,6 +2178,9 @@
AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int)
#define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int)
+#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
+
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
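A short sketch of how the controls documented in this header are applied to an
initialized aom_codec_ctx_t (for example, the encoder set up above).
AOME_SET_CPUUSED and the new AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR come from the
hunks above; AV1E_SET_SVC_REF_FRAME_CONFIG is assumed to be the pre-existing
control that consumes aom_svc_ref_frame_config_t, and the values shown are
illustrative:

  #include <aom/aomcx.h>

  static void apply_realtime_controls(aom_codec_ctx_t *codec) {
    // Speed 10 and 11 are only accepted with AOM_USAGE_REALTIME (see the
    // AOME_SET_CPUUSED note above).
    aom_codec_control(codec, AOME_SET_CPUUSED, 10);

    // New control in this change: allow at most 2 consecutive frame drops
    // by the 1-pass CBR frame dropper; a value of 0 has no effect.
    aom_codec_control(codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 2);

    // Reference/refresh flags follow the index order documented above:
    // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
    aom_svc_ref_frame_config_t ref_cfg = { 0 };
    ref_cfg.reference[0] = 1;  // predict only from LAST_FRAME
    ref_cfg.ref_idx[0] = 0;    // LAST_FRAME reads buffer slot 0
    ref_cfg.refresh[0] = 1;    // refresh slot 0 with this frame
    aom_codec_control(codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);
  }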
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4c60e5c..f8f2cbb 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -115,12 +115,17 @@
"${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
"${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
"${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
"${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
"${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
@@ -134,6 +139,11 @@
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
endif()
@@ -191,6 +201,9 @@
list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
endif()
list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
@@ -269,7 +282,15 @@
"${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
"${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
if(CONFIG_AV1_HIGHBITDEPTH)
list(APPEND AOM_DSP_ENCODER_ASM_SSE2
@@ -292,11 +313,20 @@
list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/highbd_sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
endif()
if(CONFIG_INTERNAL_STATS)
@@ -326,6 +356,10 @@
list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
endif()
endif()
@@ -433,6 +467,23 @@
endif()
endif()
+ if(HAVE_NEON_DOTPROD)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+ endif()
+ endif()
+
+ if(HAVE_NEON_I8MM)
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
if(BUILD_SHARED_LIBS)
target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index efb634a..85dc005 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -23,10 +23,6 @@
#define PI 3.141592653589793238462643383279502884
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif // ndef MAX_SB_SIZE
-
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
#define AOMSIGN(x) ((x) < 0 ? -1 : 0)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e738971..c9b2682 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -497,22 +497,22 @@
add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_copy neon sse2 avx2/;
-specialize qw/aom_convolve8_horiz neon sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert neon sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve_copy neon sse2 avx2/;
+specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/aom_scaled_2d ssse3 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
- specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
- specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
+ specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
- specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+ specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
}
#
@@ -750,7 +750,7 @@
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
+specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/;
specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
@@ -759,10 +759,10 @@
add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
- specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+ specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/;
}
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -773,35 +773,33 @@
specialize qw/aom_subtract_block neon sse2 avx2/;
add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
- specialize qw/aom_sse sse4_1 avx2 neon/;
+ specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/;
add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
- specialize qw/aom_get_blk_sse_sum sse2 avx2/;
+ specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/aom_highbd_subtract_block sse2 neon/;
add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
- specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
+ specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
}
- if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- #
- # Sum of Squares
- #
- add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
- specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
+ #
+ # Sum of Squares
+ #
+ add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+ specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
- add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
- specialize qw/aom_sum_squares_i16 sse2 neon/;
+ add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+ specialize qw/aom_sum_squares_i16 sse2 neon/;
- add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
- specialize qw/aom_var_2d_u8 sse2 avx2 neon/;
+ add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/;
- add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
- specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
- }
+ add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
#
# Single block SAD / Single block Avg SAD
@@ -816,65 +814,65 @@
add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/;
- specialize qw/aom_sad128x128 avx2 neon sse2/;
- specialize qw/aom_sad128x64 avx2 neon sse2/;
- specialize qw/aom_sad64x128 avx2 neon sse2/;
- specialize qw/aom_sad64x64 avx2 neon sse2/;
- specialize qw/aom_sad64x32 avx2 neon sse2/;
- specialize qw/aom_sad32x64 avx2 neon sse2/;
- specialize qw/aom_sad32x32 avx2 neon sse2/;
- specialize qw/aom_sad32x16 avx2 neon sse2/;
- specialize qw/aom_sad16x32 neon sse2/;
- specialize qw/aom_sad16x16 neon sse2/;
- specialize qw/aom_sad16x8 neon sse2/;
- specialize qw/aom_sad8x16 neon sse2/;
- specialize qw/aom_sad8x8 neon sse2/;
- specialize qw/aom_sad8x4 neon sse2/;
- specialize qw/aom_sad4x8 neon sse2/;
- specialize qw/aom_sad4x4 neon sse2/;
+ specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16 sse2 neon/;
+ specialize qw/aom_sad8x8 sse2 neon/;
+ specialize qw/aom_sad8x4 sse2 neon/;
+ specialize qw/aom_sad4x8 sse2 neon/;
+ specialize qw/aom_sad4x4 sse2 neon/;
- specialize qw/aom_sad4x16 neon sse2/;
- specialize qw/aom_sad16x4 neon sse2/;
- specialize qw/aom_sad8x32 neon sse2/;
- specialize qw/aom_sad32x8 neon sse2/;
- specialize qw/aom_sad16x64 neon sse2/;
- specialize qw/aom_sad64x16 neon sse2/;
+ specialize qw/aom_sad4x16 sse2 neon/;
+ specialize qw/aom_sad16x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32 sse2 neon/;
+ specialize qw/aom_sad32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16 sse2 neon neon_dotprod/;
- specialize qw/aom_sad_skip_128x128 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_128x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x128 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x32 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x64 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x32 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x16 avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x32 sse2 neon/;
- specialize qw/aom_sad_skip_16x16 sse2 neon/;
- specialize qw/aom_sad_skip_16x8 sse2 neon/;
- specialize qw/aom_sad_skip_8x16 sse2 neon/;
- specialize qw/aom_sad_skip_8x8 sse2 neon/;
- specialize qw/aom_sad_skip_8x4 neon/;
- specialize qw/aom_sad_skip_4x8 sse2 neon/;
- specialize qw/aom_sad_skip_4x4 neon/;
+ specialize qw/aom_sad_skip_128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x16 sse2 neon/;
+ specialize qw/aom_sad_skip_8x8 sse2 neon/;
+ specialize qw/aom_sad_skip_8x4 neon/;
+ specialize qw/aom_sad_skip_4x8 sse2 neon/;
+ specialize qw/aom_sad_skip_4x4 neon/;
- specialize qw/aom_sad_skip_4x16 sse2 neon/;
- specialize qw/aom_sad_skip_16x4 neon/;
- specialize qw/aom_sad_skip_8x32 sse2 neon/;
- specialize qw/aom_sad_skip_32x8 sse2 neon/;
- specialize qw/aom_sad_skip_16x64 sse2 neon/;
- specialize qw/aom_sad_skip_64x16 sse2 neon/;
+ specialize qw/aom_sad_skip_4x16 sse2 neon/;
+ specialize qw/aom_sad_skip_16x4 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x32 sse2 neon/;
+ specialize qw/aom_sad_skip_32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/;
- specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad128x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x16_avg avx2 sse2 neon/;
- specialize qw/aom_sad16x32_avg sse2 neon/;
- specialize qw/aom_sad16x16_avg sse2 neon/;
- specialize qw/aom_sad16x8_avg sse2 neon/;
+ specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x16_avg sse2 neon/;
specialize qw/aom_sad8x8_avg sse2 neon/;
specialize qw/aom_sad8x4_avg sse2 neon/;
@@ -882,36 +880,36 @@
specialize qw/aom_sad4x4_avg sse2 neon/;
specialize qw/aom_sad4x16_avg sse2 neon/;
- specialize qw/aom_sad16x4_avg sse2 neon/;
+ specialize qw/aom_sad16x4_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x32_avg sse2 neon/;
- specialize qw/aom_sad32x8_avg sse2 neon/;
- specialize qw/aom_sad16x64_avg sse2 neon/;
- specialize qw/aom_sad64x16_avg sse2 neon/;
+ specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/;
- specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
- specialize qw/aom_dist_wtd_sad128x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x128_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x4_avg sse2/;
- specialize qw/aom_dist_wtd_sad4x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad4x4_avg sse2/;
+ specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad128x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x4_avg sse2 neon/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- specialize qw/aom_dist_wtd_sad4x16_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x4_avg sse2/;
- specialize qw/aom_dist_wtd_sad8x32_avg sse2/;
- specialize qw/aom_dist_wtd_sad32x8_avg sse2/;
- specialize qw/aom_dist_wtd_sad16x64_avg sse2/;
- specialize qw/aom_dist_wtd_sad64x16_avg sse2/;
+ specialize qw/aom_dist_wtd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad16x4_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x16_avg sse2 neon neon_dotprod/;
}
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -974,27 +972,29 @@
specialize qw/aom_highbd_sad_skip_16x64 avx2 sse2 neon/;
specialize qw/aom_highbd_sad_skip_64x16 avx2 sse2 neon/;
- specialize qw/aom_highbd_sad128x128_avg avx2/;
- specialize qw/aom_highbd_sad128x64_avg avx2/;
- specialize qw/aom_highbd_sad64x128_avg avx2/;
- specialize qw/aom_highbd_sad64x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad64x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x8_avg avx2 sse2/;
- specialize qw/aom_highbd_sad8x4_avg sse2/;
- specialize qw/aom_highbd_sad4x8_avg sse2/;
- specialize qw/aom_highbd_sad4x4_avg sse2/;
+ specialize qw/aom_highbd_sad128x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad128x64_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x16_avg neon/;
+ specialize qw/aom_highbd_sad8x8_avg neon/;
+ specialize qw/aom_highbd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x4_avg sse2 neon/;
- specialize qw/aom_highbd_sad4x16_avg sse2/;
- specialize qw/aom_highbd_sad16x4_avg avx2 sse2/;
- specialize qw/aom_highbd_sad8x32_avg sse2/;
- specialize qw/aom_highbd_sad32x8_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad64x16_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_highbd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_highbd_sad16x4_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x16_avg avx2 sse2 neon/;
}
#
# Masked SAD
@@ -1009,7 +1009,7 @@
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
- specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
}
}
@@ -1030,7 +1030,7 @@
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
- specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
}
}
}
@@ -1047,47 +1047,47 @@
add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
}
- specialize qw/aom_sad128x128x4d avx2 neon sse2/;
- specialize qw/aom_sad128x64x4d avx2 neon sse2/;
- specialize qw/aom_sad64x128x4d avx2 neon sse2/;
- specialize qw/aom_sad64x64x4d avx2 neon sse2/;
- specialize qw/aom_sad64x32x4d avx2 neon sse2/;
- specialize qw/aom_sad32x64x4d avx2 neon sse2/;
- specialize qw/aom_sad32x32x4d avx2 neon sse2/;
- specialize qw/aom_sad32x16x4d avx2 neon sse2/;
- specialize qw/aom_sad16x32x4d avx2 neon sse2/;
- specialize qw/aom_sad16x16x4d avx2 neon sse2/;
- specialize qw/aom_sad16x8x4d avx2 neon sse2/;
+ specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x4d avx2 sse2 neon neon_dotprod/;
- specialize qw/aom_sad8x16x4d neon sse2/;
- specialize qw/aom_sad8x8x4d neon sse2/;
- specialize qw/aom_sad8x4x4d neon sse2/;
- specialize qw/aom_sad4x8x4d neon sse2/;
- specialize qw/aom_sad4x4x4d neon sse2/;
+ specialize qw/aom_sad8x16x4d sse2 neon/;
+ specialize qw/aom_sad8x8x4d sse2 neon/;
+ specialize qw/aom_sad8x4x4d sse2 neon/;
+ specialize qw/aom_sad4x8x4d sse2 neon/;
+ specialize qw/aom_sad4x4x4d sse2 neon/;
- specialize qw/aom_sad64x16x4d avx2 neon sse2/;
- specialize qw/aom_sad32x8x4d avx2 neon sse2/;
- specialize qw/aom_sad16x64x4d avx2 neon sse2/;
- specialize qw/aom_sad16x4x4d avx2 neon sse2/;
- specialize qw/aom_sad8x32x4d neon sse2/;
- specialize qw/aom_sad4x16x4d neon sse2/;
+ specialize qw/aom_sad64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32x4d sse2 neon/;
+ specialize qw/aom_sad4x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon neon_dotprod/;
- specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x4x4d neon/;
+ specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/;
specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
@@ -1096,29 +1096,29 @@
specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
specialize qw/aom_sad_skip_4x4x4d neon/;
- specialize qw/aom_sad128x128x3d neon avx2/;
- specialize qw/aom_sad128x64x3d neon avx2/;
- specialize qw/aom_sad64x128x3d neon avx2/;
- specialize qw/aom_sad64x64x3d neon avx2/;
- specialize qw/aom_sad64x32x3d neon avx2/;
- specialize qw/aom_sad32x64x3d neon avx2/;
- specialize qw/aom_sad32x32x3d neon avx2/;
- specialize qw/aom_sad32x16x3d neon avx2/;
- specialize qw/aom_sad16x32x3d neon avx2/;
- specialize qw/aom_sad16x16x3d neon avx2/;
- specialize qw/aom_sad16x8x3d neon avx2/;
- specialize qw/aom_sad8x16x3d neon/;
- specialize qw/aom_sad8x8x3d neon/;
- specialize qw/aom_sad8x4x3d neon/;
- specialize qw/aom_sad4x8x3d neon/;
- specialize qw/aom_sad4x4x3d neon/;
+ specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16x3d neon/;
+ specialize qw/aom_sad8x8x3d neon/;
+ specialize qw/aom_sad8x4x3d neon/;
+ specialize qw/aom_sad4x8x3d neon/;
+ specialize qw/aom_sad4x4x3d neon/;
- specialize qw/aom_sad64x16x3d neon avx2/;
- specialize qw/aom_sad32x8x3d neon avx2/;
- specialize qw/aom_sad16x64x3d neon avx2/;
- specialize qw/aom_sad16x4x3d neon/;
- specialize qw/aom_sad8x32x3d neon/;
- specialize qw/aom_sad4x16x3d neon/;
+ specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x3d neon neon_dotprod/;
+ specialize qw/aom_sad8x32x3d neon/;
+ specialize qw/aom_sad4x16x3d neon/;
specialize qw/aom_masked_sad128x128x4d ssse3 neon/;
specialize qw/aom_masked_sad128x64x4d ssse3 neon/;
@@ -1153,9 +1153,9 @@
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
- add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
if ($w != 128 && $h != 128) {
specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
}
@@ -1208,22 +1208,29 @@
specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2 neon/;
specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2 neon/;
- specialize qw/aom_highbd_sad128x128x3d avx2/;
- specialize qw/aom_highbd_sad128x64x3d avx2/;
- specialize qw/aom_highbd_sad64x128x3d avx2/;
- specialize qw/aom_highbd_sad64x64x3d avx2/;
- specialize qw/aom_highbd_sad64x32x3d avx2/;
- specialize qw/aom_highbd_sad32x64x3d avx2/;
- specialize qw/aom_highbd_sad32x32x3d avx2/;
- specialize qw/aom_highbd_sad32x16x3d avx2/;
- specialize qw/aom_highbd_sad16x32x3d avx2/;
- specialize qw/aom_highbd_sad16x16x3d avx2/;
- specialize qw/aom_highbd_sad16x8x3d avx2/;
+ specialize qw/aom_highbd_sad128x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad128x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x16x3d neon/;
+ specialize qw/aom_highbd_sad8x8x3d neon/;
+ specialize qw/aom_highbd_sad8x4x3d neon/;
+ specialize qw/aom_highbd_sad4x8x3d neon/;
+ specialize qw/aom_highbd_sad4x4x3d neon/;
- specialize qw/aom_highbd_sad16x4x3d avx2/;
- specialize qw/aom_highbd_sad32x8x3d avx2/;
- specialize qw/aom_highbd_sad16x64x3d avx2/;
- specialize qw/aom_highbd_sad64x16x3d avx2/;
+ specialize qw/aom_highbd_sad64x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x4x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x32x3d neon/;
+ specialize qw/aom_highbd_sad4x16x3d neon/;
}
#
# Avg
@@ -1323,20 +1330,20 @@
# Specialty Variance
#
add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
- specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon/;
+ specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/;
add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
- specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon/;
+ specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/;
add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse16x16 sse2 avx2 neon/;
- specialize qw/aom_mse16x8 sse2 neon/;
- specialize qw/aom_mse8x16 sse2 neon/;
- specialize qw/aom_mse8x8 sse2 neon/;
+ specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_mse16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x8 sse2 neon neon_dotprod/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach $bd (8, 10, 12) {
@@ -1345,31 +1352,32 @@
add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
- specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_mse16x8", qw/neon/;
+ specialize "aom_highbd_${bd}_mse8x16", qw/neon/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/;
}
+
+ specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
}
#
#
#
add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
- specialize qw/aom_get_mb_ss sse2/;
+ specialize qw/aom_get_mb_ss sse2 neon/;
#
# Variance / Subpixel Variance / Subpixel Avg Variance
#
- add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/;
add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
- specialize qw/aom_mse_16xh_16bit sse2 avx2/;
+ specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
@@ -1378,22 +1386,22 @@
add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
}
- specialize qw/aom_variance128x128 sse2 avx2 neon/;
- specialize qw/aom_variance128x64 sse2 avx2 neon/;
- specialize qw/aom_variance64x128 sse2 avx2 neon/;
- specialize qw/aom_variance64x64 sse2 avx2 neon/;
- specialize qw/aom_variance64x32 sse2 avx2 neon/;
- specialize qw/aom_variance32x64 sse2 avx2 neon/;
- specialize qw/aom_variance32x32 sse2 avx2 neon/;
- specialize qw/aom_variance32x16 sse2 avx2 neon/;
- specialize qw/aom_variance16x32 sse2 avx2 neon/;
- specialize qw/aom_variance16x16 sse2 avx2 neon/;
- specialize qw/aom_variance16x8 sse2 avx2 neon/;
- specialize qw/aom_variance8x16 sse2 neon/;
- specialize qw/aom_variance8x8 sse2 neon/;
- specialize qw/aom_variance8x4 sse2 neon/;
- specialize qw/aom_variance4x8 sse2 neon/;
- specialize qw/aom_variance4x4 sse2 neon/;
+ specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x4 sse2 neon neon_dotprod/;
specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/;
@@ -1430,12 +1438,12 @@
specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- specialize qw/aom_variance4x16 neon sse2/;
- specialize qw/aom_variance16x4 neon sse2 avx2/;
- specialize qw/aom_variance8x32 neon sse2/;
- specialize qw/aom_variance32x8 neon sse2 avx2/;
- specialize qw/aom_variance16x64 neon sse2 avx2/;
- specialize qw/aom_variance64x16 neon sse2 avx2/;
+ specialize qw/aom_variance4x16 neon neon_dotprod sse2/;
+ specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance8x32 neon neon_dotprod sse2/;
+ specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
@@ -1450,82 +1458,259 @@
specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
}
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 neon ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/;
- specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 neon ssse3/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
foreach $bd (8, 10, 12) {
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse2 neon/;
- } else {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
- }
- }
-
- if ($w == 4 || $h == 4) {
- # TODO(rachelbarker): When ext-partition-types is enabled, we currently
- # don't have vectorized 4x16 highbd variance functions
- if ($w == 4 && $h == 4) {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse4_1 neon/;
- } else {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
- }
- } else {
- if ($bd == 10) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", qw/neon/;
- }
- }
- }
-
-
- if ($w != 128 && $h != 128 && $w != 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
- }
- if ($w == 4 && $h == 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
- }
-
add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
}
}
+
+ specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_variance8x4 neon/;
+ specialize qw/aom_highbd_12_variance4x8 neon/;
+ specialize qw/aom_highbd_12_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_variance8x4 neon/;
+ specialize qw/aom_highbd_10_variance4x8 neon/;
+ specialize qw/aom_highbd_10_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_variance8x4 neon/;
+ specialize qw/aom_highbd_8_variance4x8 neon/;
+ specialize qw/aom_highbd_8_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ my $avx2 = ($bd == 10) ? "avx2" : "";
+ specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4" , qw/neon/;
+ }
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
}
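
Each specialize line above only records which SIMD flavours of a kernel exist; picking one happens once at start-up through the function-pointer dispatch this script emits into the generated aom_dsp_rtcd.h. Below is a minimal, self-contained sketch of that mechanism rather than libaom's generated code: the stub kernel, the SIM_HAS_* bits and setup_rtcd_sketch() are hypothetical stand-ins for the real _c/_sse2/_avx2/_neon kernels, HAS_* capability flags and setup_rtcd_internal().

/* Self-contained sketch of what a specialize() entry buys at run time.
 * Everything named *_stub, SIM_HAS_* or *_sketch is hypothetical; in the
 * real tree the prototypes, capability flags and setup function are
 * generated into aom_dsp_rtcd.h from this script. */
#include <stdint.h>

typedef unsigned int (*variance_fn_t)(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);

/* Stand-in C kernel: a plain 8-bit 16x16 variance, only here to keep the
 * sketch compilable. The real _c/_sse2/_avx2/_neon kernels live in aom_dsp/. */
static unsigned int variance16x16_stub(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       unsigned int *sse) {
  int64_t sum = 0;
  int64_t sse64 = 0;
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 16; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;
      sse64 += d * d;
    }
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (sum * sum) / (16 * 16));
}

/* Hypothetical CPU-capability bits standing in for libaom's HAS_* flags. */
enum { SIM_HAS_SSE2 = 1, SIM_HAS_AVX2 = 2, SIM_HAS_NEON = 4 };

/* The generated header exposes one function pointer per specialized symbol
 * and fills it in once; in this sketch the best available ISA wins because
 * it is assigned last. */
static variance_fn_t highbd_10_variance16x16 = variance16x16_stub;

static void setup_rtcd_sketch(int caps) {
  highbd_10_variance16x16 = variance16x16_stub;                           /* _c    */
  if (caps & SIM_HAS_SSE2) highbd_10_variance16x16 = variance16x16_stub;  /* _sse2 */
  if (caps & SIM_HAS_AVX2) highbd_10_variance16x16 = variance16x16_stub;  /* _avx2 */
  if (caps & SIM_HAS_NEON) highbd_10_variance16x16 = variance16x16_stub;  /* _neon */
}
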
#
# Masked Variance / Masked Subpixel Variance
@@ -1541,7 +1726,7 @@
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
- specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
}
}
}
@@ -1559,56 +1744,18 @@
}
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- foreach $bd ("_", "_10_", "_12_") {
+ foreach $bd ("_8_", "_10_", "_12_") {
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+ specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/;
}
}
}
}
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x32 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x64 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x32 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x16 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x4 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x8 sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x4 sse2 ssse3/;
-
#
# Comp Avg
#
@@ -1616,469 +1763,25 @@
specialize qw/aom_comp_avg_pred avx2 neon/;
add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
- specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
+ specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
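
For context on what the newly NEON-specialized aom_dist_wtd_comp_avg_pred computes: it blends the two inter predictors with unequal weights carried in DIST_WTD_COMP_PARAMS. A rough scalar sketch follows; the fwd_offset/bck_offset field names, the 4-bit weight precision and which weight lands on which input are assumptions about the library's C reference, not quotes from it.

/* Scalar model of distance-weighted compound averaging, as assumed from the
 * aom_dist_wtd_comp_avg_pred prototype above. The 4-bit precision (weights
 * summing to 16) and the field names below are assumptions, not copies of
 * libaom's C reference. */
#include <stdint.h>

typedef struct {
  int fwd_offset;  /* assumed weight for the prediction block */
  int bck_offset;  /* assumed weight for the reference block  */
} dist_wtd_params_sketch;

static void dist_wtd_comp_avg_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride,
                                     const dist_wtd_params_sketch *p) {
  const int precision_bits = 4;                 /* weights assumed to sum to 16 */
  const int round = 1 << (precision_bits - 1);  /* round to nearest             */
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int blended = pred[j] * p->fwd_offset + ref[j] * p->bck_offset;
      comp_pred[j] = (uint8_t)((blended + round) >> precision_bits);
    }
    comp_pred += width;  /* comp_pred and pred strides equal width per the prototype */
    pred += width;
    ref += ref_stride;
  }
}
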
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-
- add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance4x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance4x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x4 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance4x8 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance4x4 neon/;
-
- if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- foreach $bd (8, 10, 12) {
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance64x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance64x16" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance32x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance32x8" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x64", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance16x64" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance8x32", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance8x32" , qw/neon/;
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
- }
- }
-
- add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse8x8 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse16x16 sse2 neon/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse16x8 neon/;
- add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse8x16 neon/;
- add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse8x8 sse2 neon/;
-
add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_pred neon/;
add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
- specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
- specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2/;
+ specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon/;
}
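
aom_highbd_comp_avg_pred, which picks up a NEON version in this hunk, is the equal-weight counterpart of the distance-weighted blend sketched earlier; the high-bit-depth buffers are passed as uint8_t * handles but hold 16-bit samples. A minimal scalar sketch, assuming the usual (a + b + 1) >> 1 rounding average:

/* Scalar sketch of equal-weight compound averaging on high-bit-depth data.
 * Treating the buffers as uint16_t mirrors libaom's short-pointer convention
 * for highbd functions, but this helper is illustrative, not the library's
 * C reference. */
#include <stdint.h>

static void highbd_comp_avg_sketch(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;  /* comp_pred and pred strides equal width per the prototype */
    pred += width;
    ref += ref_stride;
  }
}
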
- #
- # Subpixel Variance
- #
- if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- }
-
add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
- specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
}
# Flow estimation library
@@ -2087,7 +1790,7 @@
specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
- specialize qw/aom_compute_flow_at_point sse4_1/;
+ specialize qw/aom_compute_flow_at_point sse4_1 neon/;
}
} # CONFIG_AV1_ENCODER
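A note on the Perl directives above, for readers unfamiliar with aom_dsp_rtcd_defs.pl: add_proto declares a function's C prototype for the generated run-time CPU detection (RTCD) header, and specialize lists the SIMD flavours that the dispatcher may pick instead of the portable _c implementation, which is why appending neon to a specialize line is all that is needed to expose a new NEON kernel. Below is a minimal, self-contained C sketch of that dispatch pattern; the kernel names, flag constant and setup function are illustrative stand-ins, not the actual generated aom_dsp_rtcd.h.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative kernels standing in for the foo_c / foo_neon pair that a
     * "specialize qw/foo neon/" line would refer to. */
    static void foo_c(const uint8_t *src, uint8_t *dst, int n) {
      for (int i = 0; i < n; ++i) dst[i] = src[i]; /* portable reference */
    }
    static void foo_neon(const uint8_t *src, uint8_t *dst, int n) {
      for (int i = 0; i < n; ++i) dst[i] = src[i]; /* would use NEON intrinsics */
    }

    /* The generated header exposes the symbol as a pointer, set once at init. */
    static void (*foo)(const uint8_t *src, uint8_t *dst, int n) = foo_c;

    #define HAS_NEON_SKETCH 0x01 /* illustrative CPU-feature flag */

    static void setup_rtcd_sketch(int cpu_flags) {
      foo = foo_c;                                     /* safe fallback */
      if (cpu_flags & HAS_NEON_SKETCH) foo = foo_neon; /* upgrade if supported */
    }

    int main(void) {
      uint8_t src[4] = { 1, 2, 3, 4 }, dst[4] = { 0 };
      setup_rtcd_sketch(HAS_NEON_SKETCH);
      foo(src, dst, 4);
      printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
      return 0;
    }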
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h
index ab950ca..69da8f2 100644
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -24,12 +24,10 @@
#define SIMD_CHECK 1 // Sanity checks in C equivalents
-#if HAVE_NEON
-#include "simd/v256_intrinsics_arm.h"
// VS compiling for 32 bit targets does not support vector types in
// structs as arguments, which makes the v256 type of the intrinsics
// hard to support, so optimizations for this target are disabled.
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
#include "simd/v256_intrinsics_x86.h"
#else
#include "simd/v256_intrinsics.h"
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 3d07a0f..c8ee780 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -24,826 +24,6 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
-#if AOM_ARCH_AARCH64 && \
- (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
- 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
- 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
- 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
- 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
- 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
- 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
-};
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
- const int8x8_t filter,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
- const int8x8_t filter,
- const uint8x16x3_t permute_tbl) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
- sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
- sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
- uint8x16_t s0, s1, s2, s3;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)x_step_q4;
- (void)filter_y;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1);
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
-
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_usdot(s0, filter, perm_tbl);
- t1 = convolve8_4_usdot(s1, filter, perm_tbl);
- t2 = convolve8_4_usdot(s2, filter, perm_tbl);
- t3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
-}
-
-static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b = vqtbl2q_u8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filter) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum;
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
- sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
- const uint8x16_t samples0_hi,
- const uint8x16_t samples1_lo,
- const uint8x16_t samples1_hi,
- const int8x8_t filter) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
- sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
- sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)filter_x;
- (void)x_step_q4;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src += 7 * src_stride;
-
- s7 = vdup_n_u8(0);
- s8 = vdup_n_u8(0);
- s9 = vdup_n_u8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
- do {
- load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123 = s4567;
- s1234 = s5678;
- s2345 = s6789;
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- s7 = vdup_n_u8(0);
- s8 = vdup_n_u8(0);
- s9 = vdup_n_u8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
- tran_concat_tbl);
-
- do {
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filter);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filter);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filter);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123_lo = s4567_lo;
- s0123_hi = s4567_hi;
- s1234_lo = s5678_lo;
- s1234_hi = s5678_hi;
- s2345_lo = s6789_lo;
- s2345_hi = s6789_hi;
- s3456_lo = s78910_lo;
- s3456_hi = s78910_hi;
-
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-#else // !defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
- const int8x8_t filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
- const int8x8_t filter,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- uint8x16_t s0, s1, s2, s3;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)x_step_q4;
- (void)filter_y;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1);
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
-
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 0);
- }
-}
-
-static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filter) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
- sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filter) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
- sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
- sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x8_t range_limit = vdup_n_u8(128);
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
-
- (void)filter_x;
- (void)x_step_q4;
- (void)y_step_q4;
-
- src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- src += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
- s7 = vdup_n_s8(0);
- s8 = vdup_n_s8(0);
- s9 = vdup_n_s8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
-
- load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8_4x1(dst + 0 * dst_stride, d01, 0);
- store_u8_4x1(dst + 1 * dst_stride, d01, 1);
- store_u8_4x1(dst + 2 * dst_stride, d23, 0);
- store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123 = s4567;
- s1234 = s5678;
- s2345 = s6789;
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
- s7 = vdup_n_s8(0);
- s8 = vdup_n_s8(0);
- s9 = vdup_n_s8(0);
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
- tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
-
- load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filter);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filter);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filter);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
- s0123_lo = s4567_lo;
- s0123_hi = s4567_hi;
- s1234_lo = s5678_lo;
- s1234_hi = s5678_hi;
- s2345_lo = s6789_lo;
- s2345_hi = s6789_hi;
- s3456_lo = s78910_lo;
- s3456_hi = s78910_hi;
-
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-#endif // defined(__ARM_FEATURE_MATMUL_INT8)
-
-#else // !(AOM_ARCH_AARCH64 &&
- // (defined(__ARM_FEATURE_DOTPROD) ||
- // defined(__ARM_FEATURE_MATMUL_INT8)))
-
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -905,7 +85,7 @@
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -918,7 +98,7 @@
do {
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -931,7 +111,7 @@
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
- transpose_u8_4x4(&d01, &d23);
+ transpose_elems_inplace_u8_4x4(&d01, &d23);
store_u8_4x1(dst + 0 * dst_stride, d01, 0);
store_u8_4x1(dst + 1 * dst_stride, d23, 0);
@@ -956,7 +136,7 @@
if (w == 4) {
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -967,7 +147,8 @@
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+ &t3);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -978,7 +159,7 @@
d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- transpose_u8_8x4(&d0, &d1, &d2, &d3);
+ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
store_u8_4x1(dst + 0 * dst_stride, d0, 0);
store_u8_4x1(dst + 1 * dst_stride, d1, 0);
@@ -1002,7 +183,7 @@
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1017,7 +198,8 @@
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1036,7 +218,8 @@
d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
- transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
@@ -1172,5 +355,3 @@
} while (w != 0);
}
}
-
-#endif // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
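The file above now keeps only the Armv8.0 NEON paths; the dot-product code deleted here reappears, renamed, in the two new translation units below (aom_convolve8_neon_dotprod.c and aom_convolve8_neon_i8mm.c) so each can be built with its own target flags. As a reminder of what every variant computes, here is a scalar sketch of one output sample of the 8-tap sub-pixel filter, rounding and saturating the way vqrshrun_n_s16(sum, FILTER_BITS) does in the kernels; FILTER_BITS is 7 in aom_dsp/aom_filter.h and the helper name is illustrative.

    #include <stdint.h>

    #define FILTER_BITS_SKETCH 7 /* mirrors FILTER_BITS */

    /* One output pixel: rounded, right-shifted, 0..255-saturated 8-tap sum.
     * 'src' points at the first of the 8 taps, i.e. the kernels' offset of
     * (SUBPEL_TAPS / 2 - 1) has already been applied by the caller. */
    static uint8_t convolve8_scalar(const uint8_t *src, const int16_t *filter) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += filter[k] * src[k];
      sum = (sum + (1 << (FILTER_BITS_SKETCH - 1))) >> FILTER_BITS_SKETCH;
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      return (uint8_t)sum;
    }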
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000..e565414
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filter);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filter);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filter);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
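The range-clamp 'correction' threaded through the *_sdot kernels above rests on a simple identity: SDOT multiplies signed 8-bit values, so each unsigned sample x is biased to x - 128 before the dot product, and the constant 128 * sum(filter) is used as the accumulator's initial value so the result is unchanged. A scalar sketch of that identity follows; the helper name is illustrative.

    #include <stdint.h>

    /* sum_k f[k]*x[k] == sum_k f[k]*(x[k] - 128) + 128 * sum_k f[k].
     * The right-hand side is what the sdot kernels evaluate: the biased
     * samples fit in int8 for SDOT and the constant term is the
     * precomputed 'correction' accumulator. */
    static int32_t convolve8_sdot_model(const uint8_t *x, const int8_t *f) {
      int32_t correction = 0;
      for (int k = 0; k < 8; ++k) correction += 128 * f[k];

      int32_t sum = correction; /* accumulate dot product into 'correction' */
      for (int k = 0; k < 8; ++k) sum += f[k] * (int8_t)(x[k] - 128);
      return sum; /* equals the plain sum of f[k] * x[k] */
    }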
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000..d778e8a
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* Accumulate the dot product into a zero-initialized sum; no range-clamp
+   * correction is needed since USDOT takes the unsigned samples directly. */
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
+}
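/* Scalar sketch of the layout produced by transpose_concat_4x4 above; the
 * helper name and the plain-array view of the vectors are illustrative only. */
static inline void transpose_concat_4x4_scalar_sketch(const uint8_t a0[4],
                                                      const uint8_t a1[4],
                                                      const uint8_t a2[4],
                                                      const uint8_t a3[4],
                                                      uint8_t b[16]) {
  const uint8_t *rows[4] = { a0, a1, a2, a3 };
  for (int col = 0; col < 4; ++col) {
    for (int row = 0; row < 4; ++row) {
      b[4 * col + row] = rows[row][col];
    }
  }
}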
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+ store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+ store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+ store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filter);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filter);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filter);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index 583d832..d746f9e 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -50,3 +50,104 @@
}
}
}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+ int width = w;
+ do {
+ s0 = vld1q_u16(s);
+ s1 = vld1q_u16(s + 8);
+ s2 = vld1q_u16(s + 16);
+ s3 = vld1q_u16(s + 24);
+ s4 = vld1q_u16(s + 32);
+ s5 = vld1q_u16(s + 40);
+ s6 = vld1q_u16(s + 48);
+ s7 = vld1q_u16(s + 56);
+
+ vst1q_u16(d, s0);
+ vst1q_u16(d + 8, s1);
+ vst1q_u16(d + 16, s2);
+ vst1q_u16(d + 24, s3);
+ vst1q_u16(d + 32, s4);
+ vst1q_u16(d + 40, s5);
+ vst1q_u16(d + 48, s6);
+ vst1q_u16(d + 56, s7);
+ s += 64;
+ d += 64;
+ width -= 64;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
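/* Scalar sketch of what the width-specialised paths above implement: a plain
 * two-dimensional block copy of 16-bit pixels (assumes <string.h> for memcpy;
 * the helper name is illustrative only). */
static void highbd_copy_scalar_sketch(const uint16_t *src,
                                      ptrdiff_t src_stride, uint16_t *dst,
                                      ptrdiff_t dst_stride, int w, int h) {
  do {
    memcpy(dst, src, w * sizeof(*src));
    src += src_stride;
    dst += dst_stride;
  } while (--h != 0);
}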
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index ef2f3af..2e79b2e 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include <assert.h>
+#include <stdlib.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
@@ -19,75 +20,68 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
-#if !AOM_ARCH_AARCH64
-static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
- const uint32x4_t b = vpaddlq_u16(a);
- const uint64x2_t c = vpaddlq_u32(b);
- return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
- vreinterpret_u32_u64(vget_high_u64(c)));
-}
-#endif
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+ const uint8x8_t s0 = load_unaligned_u8(p, stride);
+ const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
-unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
- const uint8x16_t b = load_unaligned_u8q(a, a_stride);
- const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if AOM_ARCH_AARCH64
- const uint32_t d = vaddlvq_u16(c);
- return (d + 8) >> 4;
-#else
- const uint32x2_t d = horizontal_add_u16x8_v(c);
- return vget_lane_u32(vrshr_n_u32(d, 4), 0);
-#endif
+ const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
+ return (sum + (1 << 3)) >> 4;
}
-unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
- uint16x8_t sum;
- uint8x8_t b = vld1_u8(a);
- a += a_stride;
- uint8x8_t c = vld1_u8(a);
- a += a_stride;
- sum = vaddl_u8(b, c);
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+ uint8x8_t s0 = vld1_u8(p);
+ p += stride;
+ uint8x8_t s1 = vld1_u8(p);
+ p += stride;
+ uint16x8_t acc = vaddl_u8(s0, s1);
- for (int i = 0; i < 6; ++i) {
- const uint8x8_t e = vld1_u8(a);
- a += a_stride;
- sum = vaddw_u8(sum, e);
- }
+ int i = 0;
+ do {
+ const uint8x8_t si = vld1_u8(p);
+ p += stride;
+ acc = vaddw_u8(acc, si);
+ } while (++i < 6);
-#if AOM_ARCH_AARCH64
- const uint32_t d = vaddlvq_u16(sum);
- return (d + 32) >> 6;
-#else
- const uint32x2_t d = horizontal_add_u16x8_v(sum);
- return vget_lane_u32(vrshr_n_u32(d, 6), 0);
-#endif
+ const uint32_t sum = horizontal_add_u16x8(acc);
+ return (sum + (1 << 5)) >> 6;
}
void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- for (int k = 0; k < 4; k++) {
- const int x8_idx = x16_idx + ((k & 1) << 3);
- const int y8_idx = y16_idx + ((k >> 1) << 3);
- const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
- avg[k] = aom_avg_8x8_neon(s_tmp, p);
- }
+ avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+ avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+ avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+ avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
}
int aom_satd_lp_neon(const int16_t *coeff, int length) {
- const int16x4_t zero = vdup_n_s16(0);
- int32x4_t accum = vdupq_n_s32(0);
+ int16x8_t s0 = vld1q_s16(coeff);
+ int16x8_t s1 = vld1q_s16(coeff + 8);
- do {
- const int16x8_t src0 = vld1q_s16(coeff);
- const int16x8_t src8 = vld1q_s16(coeff + 8);
- accum = vabal_s16(accum, vget_low_s16(src0), zero);
- accum = vabal_s16(accum, vget_high_s16(src0), zero);
- accum = vabal_s16(accum, vget_low_s16(src8), zero);
- accum = vabal_s16(accum, vget_high_s16(src8), zero);
+ int16x8_t abs0 = vabsq_s16(s0);
+ int16x8_t abs1 = vabsq_s16(s1);
+
+ int32x4_t acc0 = vpaddlq_s16(abs0);
+ int32x4_t acc1 = vpaddlq_s16(abs1);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s16(coeff);
+ s1 = vld1q_s16(coeff + 8);
+
+ abs0 = vabsq_s16(s0);
+ abs1 = vabsq_s16(s1);
+
+ acc0 = vpadalq_s16(acc0, abs0);
+ acc1 = vpadalq_s16(acc1, abs1);
+
length -= 16;
coeff += 16;
- } while (length != 0);
+ }
+ int32x4_t accum = vaddq_s32(acc0, acc1);
return horizontal_add_s32x4(accum);
}
@@ -180,56 +174,84 @@
} while (h < height);
}
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
int aom_satd_neon(const tran_low_t *coeff, int length) {
const int32x4_t zero = vdupq_n_s32(0);
- int32x4_t accum = zero;
- do {
- const int32x4_t src0 = vld1q_s32(&coeff[0]);
- const int32x4_t src8 = vld1q_s32(&coeff[4]);
- const int32x4_t src16 = vld1q_s32(&coeff[8]);
- const int32x4_t src24 = vld1q_s32(&coeff[12]);
- accum = vabaq_s32(accum, src0, zero);
- accum = vabaq_s32(accum, src8, zero);
- accum = vabaq_s32(accum, src16, zero);
- accum = vabaq_s32(accum, src24, zero);
+
+ int32x4_t s0 = vld1q_s32(&coeff[0]);
+ int32x4_t s1 = vld1q_s32(&coeff[4]);
+ int32x4_t s2 = vld1q_s32(&coeff[8]);
+ int32x4_t s3 = vld1q_s32(&coeff[12]);
+
+ int32x4_t accum0 = vabsq_s32(s0);
+ int32x4_t accum1 = vabsq_s32(s2);
+ accum0 = vabaq_s32(accum0, s1, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s32(&coeff[0]);
+ s1 = vld1q_s32(&coeff[4]);
+ s2 = vld1q_s32(&coeff[8]);
+ s3 = vld1q_s32(&coeff[12]);
+
+ accum0 = vabaq_s32(accum0, s0, zero);
+ accum1 = vabaq_s32(accum1, s1, zero);
+ accum0 = vabaq_s32(accum0, s2, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
length -= 16;
coeff += 16;
- } while (length != 0);
+ }
- // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
- return horizontal_add_s32x4(accum);
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+ return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
}
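/* Worked bound for the split accumulators above (a sketch of the reasoning):
 * each lane of accum0/accum1 accumulates at most length / 8 = 128 absolute
 * values of magnitude at most 524287, i.e. under 2^26 per lane, and the final
 * horizontal sum is at most 1024 * 524287 < 2^30, so the 32-bit accumulation
 * cannot overflow for any supported length. */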
int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
- int32x4_t v_mean = vdupq_n_s32(0);
- int32x4_t v_sse = v_mean;
- int16x8_t v_ref, v_src;
- int16x4_t v_low;
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
- int i, width = 4 << bwl;
- for (i = 0; i < width; i += 8) {
- v_ref = vld1q_s16(&ref[i]);
- v_src = vld1q_s16(&src[i]);
- const int16x8_t diff = vsubq_s16(v_ref, v_src);
- // diff: dynamic range [-510, 510], 10 bits.
- v_mean = vpadalq_s16(v_mean, diff);
- v_low = vget_low_s16(diff);
- v_sse = vmlal_s16(v_sse, v_low, v_low);
-#if AOM_ARCH_AARCH64
- v_sse = vmlal_high_s16(v_sse, diff, diff);
-#else
- const int16x4_t v_high = vget_high_s16(diff);
- v_sse = vmlal_s16(v_sse, v_high, v_high);
-#endif
- }
- const int mean = horizontal_add_s32x4(v_mean);
- const int sse = horizontal_add_s32x4(v_sse);
- const unsigned int mean_abs = mean >= 0 ? mean : -mean;
- // (mean * mean): dynamic range 31 bits.
- const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
- return var;
+ int16x8_t r = vld1q_s16(ref);
+ int16x8_t s = vld1q_s16(src);
+
+ // diff: dynamic range [-510, 510] 10 (signed) bits.
+ int16x8_t diff = vsubq_s16(r, s);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ int16x8_t v_mean = diff;
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ int32x4_t v_sse[2];
+ v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+
+ do {
+ r = vld1q_s16(ref);
+ s = vld1q_s16(src);
+
+ diff = vsubq_s16(r, s);
+ v_mean = vaddq_s16(v_mean, diff);
+
+ v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+ const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+ return sse - ((mean_abs * mean_abs) >> (bwl + 2));
}
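/* Scalar model of aom_vector_var_neon above, assuming 2 <= bwl <= 5 so the
 * vector is 4 << bwl elements wide. The helper name is illustrative only. */
static inline int vector_var_scalar_sketch(const int16_t *ref,
                                           const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int mean = 0;
  int sse = 0;
  for (int i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    mean += diff;
    sse += diff * diff;
  }
  /* mean^2 / width approximates the squared-mean term; width == 4 << bwl
   * gives the (bwl + 2) shift used above. Use unsigned arithmetic for the
   * square, as the vector code does, to avoid signed overflow. */
  const uint32_t mean_abs = (uint32_t)(mean >= 0 ? mean : -mean);
  return sse - (int)((mean_abs * mean_abs) >> (bwl + 2));
}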
void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index 04e0904..b17f7fc 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -13,6 +13,9 @@
#include <assert.h>
#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"
@@ -74,6 +77,75 @@
}
}
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred_ptr, wtd_avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t p = vld1_u8(pred);
+ const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+ const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+ vget_low_u8(bck_offset));
+
+ vst1_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 8;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
+
void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride,
const uint8_t *mask, int mask_stride,
@@ -84,7 +156,6 @@
const int src_stride1 = invert_mask ? ref_stride : width;
if (width > 8) {
- const uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
do {
const uint8_t *src0_ptr = src0;
const uint8_t *src1_ptr = src1;
@@ -97,19 +168,7 @@
const uint8x16_t s1 = vld1q_u8(src1_ptr);
const uint8x16_t m0 = vld1q_u8(mask_ptr);
- uint8x16_t m0_inv = vsubq_u8(max_alpha, m0);
- uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(m0));
- uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(m0));
- blend_u16_lo =
- vmlal_u8(blend_u16_lo, vget_low_u8(s1), vget_low_u8(m0_inv));
- blend_u16_hi =
- vmlal_u8(blend_u16_hi, vget_high_u8(s1), vget_high_u8(m0_inv));
-
- uint8x8_t blend_u8_lo =
- vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
- uint8x8_t blend_u8_hi =
- vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
- uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
vst1q_u8(comp_pred_ptr, blend_u8);
@@ -126,17 +185,12 @@
comp_pred += width;
} while (--height != 0);
} else if (width == 8) {
- const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
-
do {
const uint8x8_t s0 = vld1_u8(src0);
const uint8x8_t s1 = vld1_u8(src1);
const uint8x8_t m0 = vld1_u8(mask);
- uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
- uint16x8_t blend_u16 = vmull_u8(s0, m0);
- blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
vst1_u8(comp_pred, blend_u8);
@@ -146,7 +200,6 @@
comp_pred += 8;
} while (--height != 0);
} else {
- const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
int h = height / 2;
assert(width == 4);
@@ -155,10 +208,7 @@
const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
- uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
- uint16x8_t blend_u16 = vmull_u8(s0, m0);
- blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
vst1_u8(comp_pred, blend_u8);
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index c3ee0b7..7b1b66a 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -12,117 +12,34 @@
#include <arm_neon.h>
#include <assert.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
#include "config/aom_dsp_rtcd.h"
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
- const int16x8_t v_maxval, int16x8_t *res) {
- int32x4_t im_res_low, im_res_high;
- const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
- im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
- im_res_low =
- vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+ uint16x8_t round_offset) {
+ const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
- im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
- im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
- vget_high_s16(src_1));
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
- *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
- vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
- int16x8_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1, src0_2, src0_3;
- int16x8_t src1_0, src1_1, src1_2, src1_3;
- int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+ uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
- load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
- &src0_3);
- load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
- &src1_3);
+ uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
- blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
- blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
- blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
- blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+ res = vqsubq_u16(res, round_offset);
- uint16x8_t im_res1_0 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
- uint16x8_t im_res1_1 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
- uint16x8_t im_res1_2 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
- uint16x8_t im_res1_3 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
- im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
- im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
- im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
- im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
- vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
- vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
- vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
- vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
-
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
- int16x4_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1;
- int16x8_t src1_0, src1_1;
- uint16x8_t tu0 = vdupq_n_u16(0);
- uint16x8_t tu1 = vdupq_n_u16(0);
- uint16x8_t tu2 = vdupq_n_u16(0);
- uint16x8_t tu3 = vdupq_n_u16(0);
- int16x8_t mask0_1, mask2_3;
- int16x8_t res0, res1;
-
- load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
- load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
- src0_0 = vreinterpretq_s16_u16(tu0);
- src0_1 = vreinterpretq_s16_u16(tu1);
-
- src1_0 = vreinterpretq_s16_u16(tu2);
- src1_1 = vreinterpretq_s16_u16(tu3);
-
- mask0_1 = vcombine_s16(mask0, mask1);
- mask2_3 = vcombine_s16(mask2, mask3);
-
- blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
- blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
- uint16x8_t im_res_0 =
- vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
- uint16x8_t im_res_1 =
- vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
- src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
- src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
- uint8x8_t res_0 = vqmovun_s16(src0_0);
- uint8x8_t res_1 = vqmovun_s16(src0_1);
-
- store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
- store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
- store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
- store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
+ return vqrshrn_n_u16(res,
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
}
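/* A scalar sketch of the helper above, assuming libaom's usual values
 * FILTER_BITS = 7, ROUND0_BITS = 3 and COMPOUND_ROUND1_BITS = 7, so the final
 * rounding shift is 4. The helper name is illustrative only. */
static uint8_t alpha_blend_a64_d16_scalar_sketch(uint16_t m, uint16_t a,
                                                 uint16_t b,
                                                 uint16_t round_offset) {
  const int round_bits = 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS;
  int v = (m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b) >>
          AOM_BLEND_A64_ROUND_BITS;
  v = v < round_offset ? 0 : v - round_offset; /* vqsubq_u16 floors at zero. */
  v = (v + (1 << (round_bits - 1))) >> round_bits; /* vqrshrn_n_u16 rounds... */
  return (uint8_t)(v > 255 ? 255 : v); /* ...and saturates to 8 bits. */
}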
void aom_lowbd_blend_a64_d16_mask_neon(
@@ -130,19 +47,13 @@
uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
ConvolveParams *conv_params) {
- int i = 0;
- const int bd = 8;
- int w_tmp = w;
- const uint8_t *mask_tmp = mask;
- const CONV_BUF_TYPE *src0_tmp = src0;
- const CONV_BUF_TYPE *src1_tmp = src1;
- uint8_t *dst_tmp = dst;
+ (void)conv_params;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
@@ -152,294 +63,430 @@
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- uint8x8_t s0 = vdup_n_u8(0);
- uint8x8_t s1 = vdup_n_u8(0);
- uint8x8_t s2 = vdup_n_u8(0);
- uint8x8_t s3 = vdup_n_u8(0);
- uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
- int16x8_t mask0, mask1, mask2, mask3;
- int16x8_t mask4, mask5, mask6, mask7;
- int32x4_t m0_32, m1_32, m2_32, m3_32;
- int32x4_t m4_32, m5_32, m6_32, m7_32;
- uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
- uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
- int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
- const uint16x4_t vec_zero = vdup_n_u16(0);
- const uint16_t offset = round_offset - (1 << (round_bits - 1));
- const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
if (subw == 0 && subh == 0) {
- if (w_tmp > 7) {
+ if (w >= 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
- mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
- mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
- mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
- mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
- mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
- mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else if (subw == 1 && subh == 1) {
- if (w_tmp > 7) {
+ if (w >= 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
- &t7);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
- mask0 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
- mask1 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
- mask2 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
- mask3 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
- mask4 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
- mask5 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
- mask6 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
- mask7 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
- m4_32 = vpaddlq_s16(mask4);
- m5_32 = vpaddlq_s16(mask5);
- m6_32 = vpaddlq_s16(mask6);
- m7_32 = vpaddlq_s16(mask7);
-
- mask0 =
- vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
- mask1 =
- vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
- mask2 =
- vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
- mask3 =
- vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
- mask0_low = vqrshrn_n_s32(m0_32, 2);
- mask1_low = vqrshrn_n_s32(m1_32, 2);
- mask2_low = vqrshrn_n_s32(m2_32, 2);
- mask3_low = vqrshrn_n_s32(m3_32, 2);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else if (subw == 1 && subh == 0) {
- if (w_tmp > 7) {
+ if (w >= 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
- mask0 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
- mask1 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
- mask2 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
- mask3 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
- mask0 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
- mask1 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
- mask2 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
- mask3 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
- mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
- mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
- mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
} else {
- if (w_tmp > 7) {
+ if (w >= 8) {
do {
- w_tmp = w;
+ int i = 0;
do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
} else {
do {
- load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
- load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
- &s3);
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
- mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if ((subw | subh) == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+ uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t m1 = vld1_u8(mask + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
}
}
}
diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000..c8a03224
--- /dev/null
+++ b/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+ uint8x16_t b) {
+ const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+ uint8x8_t b) {
+ const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16 = vmull_u8(m, a);
+
+ blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+ return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
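/* Per-pixel scalar sketch of the two helpers above: AOM_BLEND_A64_MAX_ALPHA
 * is 64 and AOM_BLEND_A64_ROUND_BITS is 6, so this is a rounded 6-bit alpha
 * blend. The helper name is illustrative only. */
static inline uint8_t alpha_blend_a64_scalar_sketch(uint8_t m, uint8_t a,
                                                    uint8_t b) {
  const int blend = m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b;
  return (uint8_t)((blend + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                   AOM_BLEND_A64_ROUND_BITS);
}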
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+ uint16x8_t b) {
+ uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+ uint16x4_t blend_u16_lo =
+ vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi =
+ vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+ uint16x4_t b) {
+ const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u16 = vmull_u16(m, a);
+
+ blend_u16 = vmlal_u16(blend_u16, m_inv, b);
+
+ return vrshrn_n_u32(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+ return vrhaddq_u8(a, b);
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+ return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif // AOM_ARCH_AARCH64
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+ uint8x8_t c, uint8x8_t d) {
+ uint8x8_t a_c = vpadd_u8(a, c);
+ uint8x8_t b_d = vpadd_u8(b, d);
+ return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c,
+ uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+ uint8x16_t a_c = vpaddq_u8(a, c);
+ uint8x16_t b_d = vpaddq_u8(b, d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+ uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+ uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+ uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif // AOM_ARCH_AARCH64
+}
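/* When the mask is subsampled in both directions the pairwise helpers above
 * collapse each 2x2 block of mask values; a scalar sketch of that reduction
 * is below. For mask values <= 64 the saturating add in the vector version
 * never changes the result. The helper name is illustrative only. */
static inline uint8_t avg_blend_2x2_scalar_sketch(uint8_t r0c0, uint8_t r0c1,
                                                  uint8_t r1c0, uint8_t r1c1) {
  return (uint8_t)((r0c0 + r0c1 + r1c0 + r1c1 + 2) >> 2);
}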
+
+#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000..f2ada93
--- /dev/null
+++ b/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += 2 * stride;
+ i -= 2;
+ } while (i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+ // The input is 12 bits wide, so we can add up to 127 squared elements into
+ // a signed 32-bit element. Each of the four lanes of 'sse' accumulates two
+ // squared elements per row, and bh is at most 32 (64 per lane), so sse
+ // cannot overflow.
+
+ do {
+ int16x8_t d = vld1q_s16(data);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += stride;
+ } while (--i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+  // The input is 12 bits wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we accumulate into an int32x4_t vector, this
+  // means we can process up to (127 * 4) / bw rows before we need to widen to
+  // 64 bits.
+
+ int i_limit = (127 * 4) / bw;
+ int i_tmp = bh > i_limit ? i_limit : bh;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d = vld1q_s16(data_ptr);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+ data_ptr += 8;
+ j -= 8;
+ } while (j != 0);
+
+ data += stride;
+ i++;
+ } while (i < i_tmp && i < bh);
+
+ sse = vpadalq_s32(sse, sse_s32);
+ i_tmp += i_limit;
+ } while (i < bh);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 8 == 0);
+ get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
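For reference, the three kernels above produce the same two quantities, the sum and the sum of squares over a bw x bh block; they differ only in how long the squared terms can be kept in 32-bit lanes before widening, as the comments describe. A scalar sketch of the computation (illustrative, not the library's C fallback verbatim):

#include <stdint.h>

static void blk_sse_sum_scalar(const int16_t *data, int stride, int bw, int bh,
                               int *x_sum, int64_t *x2_sum) {
  int sum = 0;
  int64_t sse = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int v = data[r * stride + c];
      sum += v;               // accumulated with vpadalq_s16 above
      sse += (int64_t)v * v;  // accumulated with vmlal_s16 above
    }
  }
  *x_sum = sum;
  *x2_sum = sse;
}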
diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000..19c9b04
--- /dev/null
+++ b/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+ uint8x8_t wta, uint8x8_t wtb) {
+ uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+ wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+ return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+ uint16x4_t wta, uint16x4_t wtb) {
+ uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+ wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+ return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+ uint8x16_t wta, uint8x16_t wtb) {
+ uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+ uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+ wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+ wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+ uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+ uint16x8_t wta, uint16x8_t wtb) {
+ uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+ uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+ wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+ wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+ uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
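Each helper above forms a distance-weighted average: the two inputs are scaled by the fwd/bck weights and the sum is rounded back down by DIST_PRECISION_BITS. A scalar sketch, with the precision passed in rather than taken from av1/common/enums.h (the weights are expected to sum to 1 << dist_precision_bits):

#include <stdint.h>

static uint16_t dist_wtd_avg_scalar(uint16_t a, uint16_t b, uint16_t wta,
                                    uint16_t wtb, int dist_precision_bits) {
  const uint32_t wtd_sum = (uint32_t)a * wta + (uint32_t)b * wtb;
  // Rounded narrowing shift, matching vrshrn_n_u16/vrshrn_n_u32.
  return (uint16_t)((wtd_sum + (1u << (dist_precision_bits - 1))) >>
                    dist_precision_bits);
}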
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index a7d66b3..fb4cda7 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -48,8 +48,8 @@
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
// fdct_round_shift
int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -57,11 +57,13 @@
// s_3 * cospi_8_64 + s_2 * cospi_24_64
// s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
// fdct_round_shift
int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
@@ -69,7 +71,7 @@
// Only transpose the first pass
if (i == 0) {
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+ transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
}
*input_0 = out_0;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 82ce0cd..d0f5922 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -37,7 +37,7 @@
hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
- transpose_s16_4x4d(&a0, &a1, &a2, &a3);
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
@@ -91,7 +91,7 @@
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
- transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
@@ -120,7 +120,7 @@
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
- transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
@@ -196,56 +196,90 @@
/* Bottom right. */
aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+ // Each iteration of the loop operates on entire rows (16 samples each)
+ // because we need to swap the second and third quarters of every row in the
+ // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+ // loop at the end of aom_hadamard_16x16_c.
for (int i = 0; i < 64; i += 16) {
- const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
- const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
- const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
- const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
+ const int32x4_t a00 = vld1q_s32(coeff + 0);
+ const int32x4_t a01 = vld1q_s32(coeff + 64);
+ const int32x4_t a02 = vld1q_s32(coeff + 128);
+ const int32x4_t a03 = vld1q_s32(coeff + 192);
- const int16x8_t b00 = vhaddq_s16(a00, a01);
- const int16x8_t b01 = vhsubq_s16(a00, a01);
- const int16x8_t b02 = vhaddq_s16(a02, a03);
- const int16x8_t b03 = vhsubq_s16(a02, a03);
+ const int32x4_t b00 = vhaddq_s32(a00, a01);
+ const int32x4_t b01 = vhsubq_s32(a00, a01);
+ const int32x4_t b02 = vhaddq_s32(a02, a03);
+ const int32x4_t b03 = vhsubq_s32(a02, a03);
- const int16x8_t c00 = vaddq_s16(b00, b02);
- const int16x8_t c01 = vaddq_s16(b01, b03);
- const int16x8_t c02 = vsubq_s16(b00, b02);
- const int16x8_t c03 = vsubq_s16(b01, b03);
+ const int32x4_t c00 = vaddq_s32(b00, b02);
+ const int32x4_t c01 = vaddq_s32(b01, b03);
+ const int32x4_t c02 = vsubq_s32(b00, b02);
+ const int32x4_t c03 = vsubq_s32(b01, b03);
- const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
- const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
- const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
- const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
+ const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+ const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+ const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+ const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
- const int16x8_t b10 = vhaddq_s16(a10, a11);
- const int16x8_t b11 = vhsubq_s16(a10, a11);
- const int16x8_t b12 = vhaddq_s16(a12, a13);
- const int16x8_t b13 = vhsubq_s16(a12, a13);
+ const int32x4_t b10 = vhaddq_s32(a10, a11);
+ const int32x4_t b11 = vhsubq_s32(a10, a11);
+ const int32x4_t b12 = vhaddq_s32(a12, a13);
+ const int32x4_t b13 = vhsubq_s32(a12, a13);
- const int16x8_t c10 = vaddq_s16(b10, b12);
- const int16x8_t c11 = vaddq_s16(b11, b13);
- const int16x8_t c12 = vsubq_s16(b10, b12);
- const int16x8_t c13 = vsubq_s16(b11, b13);
+ const int32x4_t c10 = vaddq_s32(b10, b12);
+ const int32x4_t c11 = vaddq_s32(b11, b13);
+ const int32x4_t c12 = vsubq_s32(b10, b12);
+ const int32x4_t c13 = vsubq_s32(b11, b13);
- store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
- store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
- store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
- store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
+ const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+ const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+ const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+ const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
- store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
- store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
- store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
- store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
+ const int32x4_t b20 = vhaddq_s32(a20, a21);
+ const int32x4_t b21 = vhsubq_s32(a20, a21);
+ const int32x4_t b22 = vhaddq_s32(a22, a23);
+ const int32x4_t b23 = vhsubq_s32(a22, a23);
- store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
- store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
- store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
- store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
+ const int32x4_t c20 = vaddq_s32(b20, b22);
+ const int32x4_t c21 = vaddq_s32(b21, b23);
+ const int32x4_t c22 = vsubq_s32(b20, b22);
+ const int32x4_t c23 = vsubq_s32(b21, b23);
- store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
- store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
- store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
- store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+ const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+ const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+ const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+ const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+ const int32x4_t b30 = vhaddq_s32(a30, a31);
+ const int32x4_t b31 = vhsubq_s32(a30, a31);
+ const int32x4_t b32 = vhaddq_s32(a32, a33);
+ const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+ const int32x4_t c30 = vaddq_s32(b30, b32);
+ const int32x4_t c31 = vaddq_s32(b31, b33);
+ const int32x4_t c32 = vsubq_s32(b30, b32);
+ const int32x4_t c33 = vsubq_s32(b31, b33);
+
+ vst1q_s32(coeff + 0 + 0, c00);
+ vst1q_s32(coeff + 0 + 4, c20);
+ vst1q_s32(coeff + 0 + 8, c10);
+ vst1q_s32(coeff + 0 + 12, c30);
+
+ vst1q_s32(coeff + 64 + 0, c01);
+ vst1q_s32(coeff + 64 + 4, c21);
+ vst1q_s32(coeff + 64 + 8, c11);
+ vst1q_s32(coeff + 64 + 12, c31);
+
+ vst1q_s32(coeff + 128 + 0, c02);
+ vst1q_s32(coeff + 128 + 4, c22);
+ vst1q_s32(coeff + 128 + 8, c12);
+ vst1q_s32(coeff + 128 + 12, c32);
+
+ vst1q_s32(coeff + 192 + 0, c03);
+ vst1q_s32(coeff + 192 + 4, c23);
+ vst1q_s32(coeff + 192 + 8, c13);
+ vst1q_s32(coeff + 192 + 12, c33);
coeff += 16;
}
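The rewritten second pass above combines the four 8x8 Hadamard results in 32-bit lanes, using halving butterflies (vhaddq_s32/vhsubq_s32 compute (x +/- y) >> 1) followed by plain butterflies, and swaps the middle quarters of each row only to match the AVX2 output order. Ignoring that reordering, the per-position arithmetic is equivalent to this scalar sketch (tran_low_t assumed to be int32_t, as implied by the vld1q_s32 loads):

#include <stdint.h>

typedef int32_t tran_low_t;  // assumption for this sketch

static void hadamard_16x16_combine_one(tran_low_t *coeff) {
  const tran_low_t a0 = coeff[0];
  const tran_low_t a1 = coeff[64];
  const tran_low_t a2 = coeff[128];
  const tran_low_t a3 = coeff[192];

  const tran_low_t b0 = (a0 + a1) >> 1;  // vhaddq_s32
  const tran_low_t b1 = (a0 - a1) >> 1;  // vhsubq_s32
  const tran_low_t b2 = (a2 + a3) >> 1;
  const tran_low_t b3 = (a2 - a3) >> 1;

  coeff[0] = b0 + b2;
  coeff[64] = b1 + b3;
  coeff[128] = b0 - b2;
  coeff[192] = b1 - b3;
}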
diff --git a/aom_dsp/arm/highbd_avg_pred_neon.c b/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000..531309b
--- /dev/null
+++ b/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width >= 8) {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src0 + j);
+ const uint16x8_t s1 = vld1q_u16(src1 + j);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(comp_pred + j, blend_u16);
+
+ j += 8;
+ } while (j < width);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+
+ do {
+ const uint16x4_t s0 = vld1_u16(src0);
+ const uint16x4_t s1 = vld1_u16(src1);
+ const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ vst1_u16(comp_pred, blend_u16);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 4;
+ } while (--height != 0);
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ const uint16x4_t avg = dist_wtd_avg_u16x4(
+ r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000..bdd2177
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ const uint16x4_t m0 =
+ vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+ do {
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
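alpha_blend_a64_u16x8/alpha_blend_a64_u16x4 come from blend_neon.h and implement the usual A64 alpha blend used throughout these kernels. A scalar sketch, with the constants passed in rather than pulled from aom_dsp/blend.h (max_alpha corresponds to AOM_BLEND_A64_MAX_ALPHA and round_bits to AOM_BLEND_A64_ROUND_BITS):

#include <stdint.h>

static uint16_t blend_a64_scalar(uint16_t m, uint16_t a, uint16_t b,
                                 int max_alpha, int round_bits) {
  // m is in [0, max_alpha]; the weighted sum is rounded back down.
  const uint32_t sum = (uint32_t)m * a + (uint32_t)(max_alpha - m) * b;
  return (uint16_t)((sum + (1u << (round_bits - 1))) >> round_bits);
}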
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000..36d763a
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \
+ static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \
+ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \
+ const uint16x8_t m_inv = \
+ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \
+ \
+ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_low_u16(m), vget_low_u16(a)); \
+ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_high_u16(m), vget_high_u16(a)); \
+ \
+ blend_u32_lo = \
+ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \
+ blend_u32_hi = \
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \
+ \
+ uint16x4_t blend_u16_lo = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ uint16x4_t blend_u16_hi = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ \
+ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \
+ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \
+ \
+ return blend_u16; \
+ } \
+ \
+ static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \
+ int subh) { \
+ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \
+ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \
+ int32x4_t offset = \
+ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \
+ \
+ if ((subw | subh) == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if ((subw & subh) == 1) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \
+ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \
+ vget_high_u8(m1))); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = \
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if (subw == 1 && subh == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 2 * i); \
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0_2 = \
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+ uint8x8_t m1_3 = \
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_unaligned_u16_4x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } \
+ }
+
+// 12-bit depth
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10-bit depth
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8-bit depth
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
+
+void aom_highbd_blend_a64_d16_mask_neon(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ (void)conv_params;
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ if (bd == 12) {
+ highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else if (bd == 10) {
+ highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else {
+ highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh);
+ }
+}
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if ((subw | subh) == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
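The d16 variants generated by HBD_BLEND_A64_D16_MASK blend intermediate compound convolution buffers rather than pixels, so on top of the plain A64 blend they subtract the compound round offset and shift down by the remaining filter precision before clamping to the bit depth. Pulling that arithmetic out of the macro into scalar form, with the libaom constants passed in as parameters (filter_bits ~ FILTER_BITS, round1_bits ~ COMPOUND_ROUND1_BITS, max_alpha/blend_round_bits ~ AOM_BLEND_A64_*):

#include <stdint.h>

static uint16_t blend_a64_d16_scalar(uint16_t m, uint16_t a, uint16_t b, int bd,
                                     int round0_bits, int filter_bits,
                                     int round1_bits, int max_alpha,
                                     int blend_round_bits) {
  const int offset_bits = bd + 2 * filter_bits - round0_bits;
  const int32_t round_offset = (1 << (offset_bits - round1_bits)) +
                               (1 << (offset_bits - round1_bits - 1));
  const int shift =
      blend_round_bits + 2 * filter_bits - round0_bits - round1_bits;

  int64_t sum = (int64_t)m * a + (int64_t)(max_alpha - m) * b -
                ((int64_t)round_offset << blend_round_bits);
  sum = (sum + (1 << (shift - 1))) >> shift;  // rounded shift (vqrshrun_n_s32)
  if (sum < 0) sum = 0;                       // vqrshrun saturates at zero
  const int32_t max = (1 << bd) - 1;          // final clamp (vminq_u16)
  return (uint16_t)(sum > max ? max : sum);
}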
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000..ea3d655
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 1;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+ uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+ uint16x8_t m = vcombine_u16(m1, m2);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ do {
+ uint16x4_t m0 = vdup_n_u16(0);
+ m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+ uint8x8_t m0_zip =
+ vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+ m0 = vget_low_u16(vmovl_u8(m0_zip));
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000..e25438c
--- /dev/null
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t highbd_convolve8_4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ int32x4_t sum =
+ highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ const int16x8_t s2 = vextq_s16(s0, s1, 1);
+ const int16x8_t s3 = vextq_s16(s0, s1, 2);
+ const int16x8_t s4 = vextq_s16(s0, s1, 3);
+ const int16x4_t s0_lo = vget_low_s16(s0);
+ const int16x4_t s1_lo = vget_low_s16(s2);
+ const int16x4_t s2_lo = vget_low_s16(s3);
+ const int16x4_t s3_lo = vget_low_s16(s4);
+ const int16x4_t s4_lo = vget_high_s16(s0);
+ const int16x4_t s5_lo = vget_high_s16(s2);
+ const int16x4_t s6_lo = vget_high_s16(s3);
+ const int16x4_t s7_lo = vget_high_s16(s4);
+
+ return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+ s7_lo, x_filter_0_7);
+}
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void highbd_convolve8_8_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ int32x4_t *sum0, int32x4_t *sum1) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+ const int16x8_t s0_hi,
+ const int16x8_t x_filter_0_7,
+ int32x4_t *sum0,
+ int32x4_t *sum1) {
+ const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+ const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+ const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+ const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+ const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+ const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+ const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+ sum1);
+}
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum0, sum1;
+ highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ int32x4_t sum0;
+ int32x4_t sum1;
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+ &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *x_filter_ptr,
+ int x_step_q4, int w, int h, int bd) {
+ assert(w >= 4 && h >= 4);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0, s1, s2, s3;
+ load_s16_8x2(s, src_stride, &s0, &s2);
+ load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+ uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+ uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ d01 = vminq_u16(d01, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int x_q4 = 0;
+
+ const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+ int16x8_t s0, s2, s4, s6;
+ load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+ src_x += 8;
+
+ do {
+ int16x8_t s1, s3, s5, s7;
+ load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+ uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+ uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+ uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+ uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s1;
+ s2 = s3;
+ s4 = s5;
+ s6 = s7;
+ src_x += 8;
+ d += 8;
+ width -= 8;
+ x_q4 += 8 * x_step_q4;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_y;
+ (void)y_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= SUBPEL_TAPS / 2 - 1;
+ highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, w, h, bd);
+ }
+}
+
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *y_filter_ptr, int w, int h,
+ int bd) {
+ assert(w >= 4 && h >= 4);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ uint16x8_t d23 = vcombine_u16(d2, d3);
+
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+ vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+ vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+ s7, y_filter);
+ uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+ s8, y_filter);
+ uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+ s9, y_filter);
+ uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+ s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_x;
+ (void)x_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+ highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+ bd);
+ }
+}
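Both the horizontal and vertical paths above evaluate an 8-tap filter in 32-bit accumulators, narrow back down with a rounding, saturating shift by FILTER_BITS (vqrshrun_n_s32), and clamp to the bit depth. One output sample in scalar form, with filter_bits standing in for libaom's FILTER_BITS:

#include <stdint.h>

static uint16_t highbd_convolve8_scalar(const int16_t s[8],
                                        const int16_t filter[8], int bd,
                                        int filter_bits) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)s[k] * filter[k];

  // Rounded shift with saturation towards zero, as done by vqrshrun_n_s32.
  sum = (sum + (1 << (filter_bits - 1))) >> filter_bits;
  if (sum < 0) sum = 0;

  // Clamp to the valid pixel range for this bit depth (vminq_u16 with max).
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(sum > max ? max : sum);
}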
diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c
index aad2046..d28617c 100644
--- a/aom_dsp/arm/highbd_hadamard_neon.c
+++ b/aom_dsp/arm/highbd_hadamard_neon.c
@@ -109,7 +109,7 @@
// For the first pass we can stay in 16-bit elements (4095*8 = 32760).
hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
// For the second pass we need to widen to 32-bit elements, so we're
// processing 4 columns at a time.
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 63f53c3..366ca3f 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -15,6 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/intrapred_common.h"
// -----------------------------------------------------------------------------
@@ -191,7 +192,7 @@
uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
- int sum = horizontal_add_and_broadcast_long_u16x8(sum_vec)[0]; \
+ int sum = horizontal_add_u16x8(sum_vec); \
int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
}
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 2b5128e..77727b7 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -298,7 +298,7 @@
uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
vld1_u16(dst_q1) };
- transpose_u16_4x4(src);
+ transpose_array_inplace_u16_4x4(src);
// Adjust thresholds to bitdepth.
const int outer_thresh = *blimit << (bd - 8);
@@ -344,7 +344,7 @@
vget_high_u16(p0q0_output),
vget_high_u16(p1q1_output),
};
- transpose_u16_4x4(output);
+ transpose_array_inplace_u16_4x4(output);
vst1_u16(dst_p1, output[0]);
vst1_u16(dst_p0, output[1]);
@@ -386,7 +386,7 @@
// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
// ^^^^^^ ^^^^^^
// Should dual issue with the left shift.
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
sum = vaddq_u16(sum, outer_sum);
@@ -401,7 +401,7 @@
// p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
// ^^^^^^^^
sum = vsubq_u16(sum, p2q2_double);
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
*p0q0_output = vrshrq_n_u16(sum, 3);
@@ -505,7 +505,7 @@
// and src_raw[3] after transpose.
uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
vld1q_u16(dst_2), vld1q_u16(dst_3) };
- transpose_u16_4x8q(src_raw);
+ transpose_array_inplace_u16_4x8(src_raw);
// p2, p1, p0, q0, q1, q2
const uint16x4_t src[6] = {
vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
@@ -574,7 +574,7 @@
vget_high_u16(p0q0_output),
vget_high_u16(p1q1_output),
};
- transpose_u16_4x4(output);
+ transpose_array_inplace_u16_4x4(output);
// dst_n starts at p2, so adjust to p1.
vst1_u16(dst_0 + 1, output[0]);
@@ -626,7 +626,7 @@
// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
// ^^^^^^
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
sum = vaddq_u16(sum, q0p0);
*p2q2_output = vrshrq_n_u16(sum, 3);
@@ -635,7 +635,7 @@
// p1 = p2 - p3 - p2 + p1 + q1
// q1 = q2 - q3 - q2 + q0 + p1
sum = vsubq_u16(sum, p23q23);
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
*p1q1_output = vrshrq_n_u16(sum, 3);
@@ -644,7 +644,7 @@
// p0 = p1 - p3 - p1 + p0 + q2
// q0 = q1 - q3 - q1 + q0 + p2
sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
- const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
*p0q0_output = vrshrq_n_u16(sum, 3);
@@ -827,7 +827,7 @@
uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
// After transpose, |output| will contain rows of the form:
// p0 p1 p2 p3 q0 q1 q2 q3
- transpose_u16_4x8q(output);
+ transpose_array_inplace_u16_4x8(output);
// Reverse p values to produce original order:
// p3 p2 p1 p0 q0 q1 q2 q3
@@ -883,7 +883,7 @@
// ^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^
- const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
sum = vaddq_u16(sum, q0p0);
*p5q5_output = vrshrq_n_u16(sum, 4);
@@ -892,7 +892,7 @@
// p4 = p5 - (2 * p6) + p3 + q1
// q4 = q5 - (2 * q6) + q3 + p1
sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
- const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
*p4q4_output = vrshrq_n_u16(sum, 4);
@@ -901,7 +901,7 @@
// p3 = p4 - p6 - p5 + p2 + q2
// q3 = q4 - q6 - q5 + q2 + p2
sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
- const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
*p3q3_output = vrshrq_n_u16(sum, 4);
@@ -910,7 +910,7 @@
// p2 = p3 - p6 - p4 + p1 + q3
// q2 = q3 - q6 - q4 + q1 + p3
sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
- const uint16x8_t q3p3 = transpose64_u16q(p3q3);
+ const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
*p2q2_output = vrshrq_n_u16(sum, 4);
@@ -919,7 +919,7 @@
// p1 = p2 - p6 - p3 + p0 + q4
// q1 = q2 - q6 - q3 + q0 + p4
sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
- const uint16x8_t q4p4 = transpose64_u16q(p4q4);
+ const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
*p1q1_output = vrshrq_n_u16(sum, 4);
@@ -928,7 +928,7 @@
// p0 = p1 - p6 - p2 + q0 + q5
// q0 = q1 - q6 - q2 + p0 + p5
sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
- const uint16x8_t q5p5 = transpose64_u16q(p5q5);
+ const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
*p0q0_output = vrshrq_n_u16(sum, 4);
@@ -1118,14 +1118,14 @@
uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
vld1q_u16(dst_3) };
// p7 will be the low half of src_p[0]. Not used until the end.
- transpose_u16_4x8q(src_p);
+ transpose_array_inplace_u16_4x8(src_p);
// Low halves: q0 q1 q2 q3
// High halves: q4 q5 q6 q7
uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
// q7 will be the high half of src_q[3]. Not used until the end.
- transpose_u16_4x8q(src_q);
+ transpose_array_inplace_u16_4x8(src_q);
// Adjust thresholds to bitdepth.
const int outer_thresh = *blimit << (bd - 8);
@@ -1238,10 +1238,10 @@
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
- transpose_u16_4x8q(output_p);
+ transpose_array_inplace_u16_4x8(output_p);
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
- transpose_u16_4x8q(output_q);
+ transpose_array_inplace_u16_4x8(output_q);
// Reverse p values to produce original order:
// p3 p2 p1 p0 q0 q1 q2 q3
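The hunks above swap the old transpose64_u16q() helper for vextq_u16(x, x, 4), which rotates the concatenation x:x by four 16-bit lanes and therefore exchanges the two 64-bit halves of a packed pNqN vector. A minimal standalone sketch of that equivalence (assumes an AArch64/NEON toolchain; the sample values are made up):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  // Packed p0q0 layout used by the loop filter: p half low, q half high.
  const uint16_t p0q0[8] = { 1, 2, 3, 4, 101, 102, 103, 104 };
  const uint16x8_t v = vld1q_u16(p0q0);
  // vextq_u16(v, v, 4) takes lanes 4..7 of v followed by lanes 0..3 of v,
  // i.e. it swaps the two 64-bit halves, producing q0p0 from p0q0.
  const uint16x8_t swapped = vextq_u16(v, v, 4);
  uint16_t out[8];
  vst1q_u16(out, swapped);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // 101 102 103 104 1 2 3 4
  printf("\n");
  return 0;
}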
diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000..9262d81
--- /dev/null
+++ b/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ const uint16x8_t s0 = vld1q_u16(src);
+ const uint16x8_t a0 = vld1q_u16(a);
+ const uint16x8_t b0 = vld1q_u16(b);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+ return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+ return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+ return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
+static INLINE unsigned int masked_sad_128xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+ sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+ sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+ sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+ height -= 4;
+ } while (height != 0);
+
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+ sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+ return horizontal_add_u32x4(sad_u32[0]);
+}
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad = vdupq_n_u16(0);
+ for (int h = 0; h < 4; ++h) {
+ sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+static INLINE unsigned int masked_sad_16xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 8; ++h) {
+ sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 8;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 16; ++h) {
+ sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 16;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 8 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 64 elements in the
+ // uint16x4_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ uint16x4_t sad = vdup_n_u16(0);
+ do {
+ uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+ uint16x4_t a0 = load_unaligned_u16_4x1(a);
+ uint16x4_t b0 = load_unaligned_u16_4x1(b);
+ uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+ sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000..28699e6
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t *sum) {
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height / 2;
+ do {
+ uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t r = vld1q_u16(ref_ptr);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int width, int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+ uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+ wsrc += 16;
+ mask += 16;
+ i += 16;
+ } while (i < width);
+
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+ uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+ uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+ highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+ highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+ wsrc += 32;
+ mask += 32;
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000..d592246
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ uint32x4_t *sse,
+ int32x4_t *sum) {
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_hi =
+ vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+ diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+ // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero, whereas vrshrq_n_s32 rounds to nearest with ties rounded up.
+ // This difference only affects the bit patterns at the rounding breakpoints
+ // exactly, so we can add -1 to all negative numbers to move the breakpoint
+ // one value across and into the correct rounding region.
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sum = vaddq_s32(*sum, round_lo);
+ *sum = vaddq_s32(*sum, round_hi);
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+ vreinterpretq_u32_s32(round_lo));
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+ vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data, we can only accumulate up to 256 squared values in each
+// unsigned 32-bit element (4095*4095*256 = 4292870400) before we have to
+// accumulate into 64-bit elements. Therefore blocks of size 32x64, 64x32,
+// 64x64, 64x128, 128x64, 128x128 are processed in a different helper function.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ do {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int j = 0;
+
+ do {
+ int i = 0;
+
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+ &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ j++;
+ } while (j < h_tmp);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+ h -= h_tmp;
+ } while (h != 0);
+
+ *sse = horizontal_add_u64x2(sse_u64);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_large_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = vld1q_u16(pre);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ assert(h % 2 == 0);
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+ &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+ &sse64, &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index 77a7aac..6149c9f 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -11,14 +11,11 @@
#include <arm_neon.h>
#include <assert.h>
+#include <string.h>
#include "config/aom_config.h"
#include "aom_dsp/quantize.h"
-#include "aom_dsp/arm/mem_neon.h"
-
-#include "av1/common/quant_common.h"
-#include "av1/encoder/av1_quantize.h"
static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
@@ -83,6 +80,7 @@
return vmaxq_s16(v_eobmax, v_nz_iscan);
}
+#if !CONFIG_REALTIME_ONLY
static INLINE void get_min_max_lane_eob(const int16_t *iscan,
int16x8_t *v_eobmin,
int16x8_t *v_eobmax, uint16x8_t v_mask,
@@ -91,13 +89,14 @@
const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
#if SKIP_EOB_FACTOR_ADJUST
const int16x8_t v_nz_iscan_min =
- vbslq_s16(v_mask, v_iscan, vdupq_n_s16(n_coeffs));
+ vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
*v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
#else
(void)v_eobmin;
#endif
*v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
}
+#endif // !CONFIG_REALTIME_ONLY
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
@@ -117,6 +116,7 @@
#endif
}
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
#if AOM_ARCH_AARCH64
return (uint16_t)vminvq_s16(v_eobmin);
@@ -134,6 +134,7 @@
return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
#endif
}
+#endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
static void highbd_quantize_b_neon(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
@@ -298,7 +299,7 @@
int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
- int16x8_t v_eobmin = vdupq_n_s16(n_coeffs);
+ int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
assert(n_coeffs > 8);
// Pre-scan pass
diff --git a/aom_dsp/arm/highbd_sad4d_neon.c b/aom_dsp/arm/highbd_sad4d_neon.c
deleted file mode 100644
index f2fda36..0000000
--- a/aom_dsp/arm/highbd_sad4d_neon.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/sum_neon.h"
-
-static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
- const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
- const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
- const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
- const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
- uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
-
- int i = 0;
- do {
- uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
- uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
- uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
- uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
- uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
-
- sum[0] = vabal_u16(sum[0], s, r0);
- sum[1] = vabal_u16(sum[1], s, r1);
- sum[2] = vabal_u16(sum[2], s, r2);
- sum[3] = vabal_u16(sum[3], s, r3);
-
- } while (++i < h);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
- const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
- const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
- const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
- const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
- uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
- uint32x4_t sum_u32[4];
-
- int i = 0;
- do {
- uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
-
- sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
- sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
- sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
- sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
-
- } while (++i < h);
-
- sum_u32[0] = vpaddlq_u16(sum[0]);
- sum_u32[1] = vpaddlq_u16(sum[1]);
- sum_u32[2] = vpaddlq_u16(sum[2]);
- sum_u32[3] = vpaddlq_u16(sum[3]);
- vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
-}
-
-static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
- uint32x4_t *const sad_sum) {
- uint16x8_t abs_diff = vabdq_u16(src, ref);
- *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
-}
-
-static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
- const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
- const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
- const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
- const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
- uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
-
- int i = 0;
- do {
- uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
- sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
- sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
- sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
- sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
-
- } while (++i < h);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
- const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
- const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
- const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
- const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
- uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum[4];
-
- int i = 0;
- do {
- uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
- sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
- sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
- sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
- sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
-
- uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
- sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
- sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
- sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
- sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
-
- } while (++i < h);
-
- sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
- sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
- sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
- sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int w, int h) {
- const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
- const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
- const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
- const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
- const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
- uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum[4];
-
- int i = 0;
- do {
- int j = 0;
- do {
- uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
- sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
- sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
- sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
- sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
-
- uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
- sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
- sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
- sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
- sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
-
- uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
- sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
- &sum_lo[0]);
- sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
- &sum_lo[1]);
- sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
- &sum_lo[2]);
- sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
- &sum_lo[3]);
-
- uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
- sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
- &sum_hi[0]);
- sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
- &sum_hi[1]);
- sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
- &sum_hi[2]);
- sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
- &sum_hi[3]);
-
- j += 32;
- } while (j < w);
-
- } while (++i < h);
-
- sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
- sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
- sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
- sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad128xhx4d_large_neon(
- const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4], int h) {
- highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
- 128, h);
-}
-
-static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
- h);
-}
-
-static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *const ref_ptr[4],
- int ref_stride, uint32_t res[4],
- int h) {
- highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
- h);
-}
-
-#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \
- void aom_highbd_sad##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
- int ref_stride, uint32_t sad_array[4]) { \
- highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \
- sad_array, (h)); \
- }
-
-#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \
- void aom_highbd_sad##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
- int ref_stride, uint32_t sad_array[4]) { \
- highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \
- sad_array, (h)); \
- }
-
-HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
-#endif // !CONFIG_REALTIME_ONLY
-
-#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \
- void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
- int ref_stride, uint32_t sad_array[4]) { \
- highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \
- 2 * ref_stride, sad_array, ((h) >> 1)); \
- sad_array[0] <<= 1; \
- sad_array[1] <<= 1; \
- sad_array[2] <<= 1; \
- sad_array[3] <<= 1; \
- }
-
-#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \
- void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
- int ref_stride, uint32_t sad_array[4]) { \
- highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \
- 2 * ref_stride, sad_array, ((h) >> 1)); \
- sad_array[0] <<= 1; \
- sad_array[1] <<= 1; \
- sad_array[2] <<= 1; \
- sad_array[3] <<= 1; \
- }
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
-#endif // !CONFIG_REALTIME_ONLY
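The file deleted above (its contents reappear in the new highbd_sadxd_neon.c below) implements the x4d pattern: one source block is scored against four reference candidates in a single pass, so each source load is shared across all four SADs. A plain-C model of that pattern, with an illustrative helper name:

#include <stdint.h>
#include <stdlib.h>

// Scalar model of the x4d pattern: one pass over the source produces four
// SADs, one per reference candidate, so each source row is read only once.
static void sad_x4d_scalar(const uint16_t *src, int src_stride,
                           const uint16_t *const ref[4], int ref_stride,
                           int w, int h, uint32_t res[4]) {
  for (int k = 0; k < 4; ++k) res[k] = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int s = src[i * src_stride + j];
      for (int k = 0; k < 4; ++k) {
        res[k] += (uint32_t)abs(s - (int)ref[k][i * ref_stride + j]);
      }
    }
  }
}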
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index 919eb55..d51f639 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -61,6 +61,7 @@
return horizontal_add_u16x8(sum);
}
+#if !CONFIG_REALTIME_ONLY
static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -82,6 +83,7 @@
return horizontal_add_u32x4(sum_u32);
}
+#endif // !CONFIG_REALTIME_ONLY
static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
int src_stride,
@@ -283,3 +285,225 @@
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t aom_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sadxd_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000..85ca673
--- /dev/null
+++ b/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+ uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+ sum[3] = vabal_u16(sum[3], s, r3);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+ sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+ } while (++i < h);
+
+ sum_u32[0] = vpaddlq_u16(sum[0]);
+ sum_u32[1] = vpaddlq_u16(sum[1]);
+ sum_u32[2] = vpaddlq_u16(sum[2]);
+ sum_u32[3] = vpaddlq_u16(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+ uint32x4_t *const sad_sum) {
+ uint16x8_t abs_diff = vabdq_u16(src, ref);
+ *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+ &sum_lo[3]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+ &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad128xhx4d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
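+// The "skip" variants approximate the full-block SAD by operating on every
+// other row: the strides are doubled, the height is halved, and the four
+// per-reference sums are doubled afterwards to compensate.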
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+ res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+ res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum[3];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000..184e9f9
--- /dev/null
+++ b/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
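+// The *_init_neon variant starts a fresh pair of 32-bit accumulators for one
+// row of 8 pixels (vmull), while highbd_sse_8x1_neon accumulates into
+// existing ones (vmlal), avoiding a separate zero-initialization of the
+// accumulators before the first row.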
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+ const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[16];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[4];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2];
+ highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ // Peel the first loop iteration.
+ uint16x4_t s = vld1_u16(src);
+ uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t abs_diff = vabd_u16(s, r);
+ uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ s = vld1_u16(src);
+ r = vld1_u16(ref);
+
+ abs_diff = vabd_u16(s, r);
+ sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4(sse);
+}
+
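+// Generic fallback for widths without a specialized kernel. Widths that are
+// not a multiple of 8 are handled by masking out the lanes beyond the block
+// width in the final iteration of the inner loop.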
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ // { 0, 1, 2, 3, 4, 5, 6, 7 }
+ uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+ uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
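+  // For example, with width == 12 the final inner iteration has w == 4, so
+  // remainder_mask == { ~0, ~0, ~0, ~0, 0, 0, 0, 0 } and the last four lanes
+  // of both s and r are zeroed below, contributing nothing to the sum.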
+ uint64_t sse = 0;
+
+ do {
+ int w = width;
+ int offset = 0;
+
+ do {
+ uint16x8_t s = vld1q_u16(src + offset);
+ uint16x8_t r = vld1q_u16(ref + offset);
+
+ if (w < 8) {
+ // Mask out-of-range elements.
+ s = vandq_u16(s, remainder_mask);
+ r = vandq_u16(r, remainder_mask);
+ }
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ offset += 8;
+ w -= 8;
+ } while (w > 0);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4:
+ return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000..bdbbf70
--- /dev/null
+++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common factor (16), so that the sum of both
+// weights is 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
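+//
+// Worked example (illustrative only, not part of the upstream filter table):
+// for filter_offset == 3 the factored taps are f0 = 8 - 3 = 5 and f1 = 3,
+// i.e. the original pair { 80, 48 } divided by 16, and each output pixel is
+//   out = (5 * s0 + 3 * s1 + 4) >> 3
+// which is exactly the vmul/vmla/vrshr_n(..., 3) sequence used below. With
+// 12-bit input the intermediate 5 * 4095 + 3 * 4095 = 32760 still fits in
+// 16 bits, so no widening is needed.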
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block whose width is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 128, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
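+// For blocks of width >= 16, specialize on the subpel offsets: an offset of
+// 0 needs no filtering in that direction, an offset of 4 means both filter
+// taps are equal so a simple pairwise average suffices, and only the
+// remaining offsets fall back to the general bilinear filter.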
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
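+// Sub-pixel average variance: as above, but the filtered block is averaged
+// with the second predictor before the variance against the reference is
+// computed.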
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
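+// Masked sub-pixel variance: bilinearly filter the source, blend the result
+// with the second predictor under the given mask via
+// aom_highbd_comp_mask_pred_neon, and measure the variance of that blend
+// against the reference.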
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
+ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ if (xoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
+ w, h, src, src_stride, msk, msk_stride, \
+ invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
+ w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
+ src_stride, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
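Note on the specialised macro above: the weights used by the helpers suggest xoffset/yoffset are sub-pel offsets in [0, 7], so 0 means no filtering, 4 means both taps are equal (a plain average) and any other value needs the general bilinear kernel. A minimal scalar sketch of the two-pass filter that the highbd_var_filter_block2d_bil_w* helpers appear to implement (the scalar function name is ours, not part of the patch; it mirrors the (8 - offset, offset) weights and 3-bit rounding visible in the dist_wtd bilinear helpers later in this file):

#include <stdint.h>

// Scalar model of one filter pass: blend each pixel with its neighbour at
// pixel_step using weights (8 - offset) and offset, rounding by 3 bits.
// The horizontal pass uses pixel_step == 1, the vertical pass uses
// pixel_step == dst_width.
static void scalar_filter_block2d_bil(const uint16_t *src, uint16_t *dst,
                                      int src_stride, int pixel_step,
                                      int dst_width, int dst_height,
                                      int offset) {
  for (int i = 0; i < dst_height; i++) {
    for (int j = 0; j < dst_width; j++) {
      dst[j] = (uint16_t)((src[j] * (8 - offset) +
                           src[j + pixel_step] * offset + 4) >> 3);
    }
    src += src_stride;
    dst += dst_width;
  }
}

With offset == 4 both weights equal 4 and the blend reduces to a rounded average, which is why the 0 and 4 branches above can drop the multiplies.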
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ pre, pre_stride, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
+ pre_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width,
+ int dst_height,
+ const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
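The helper above and the *_var_filter_* variants that follow fuse the prediction step with AV1's distance-weighted compound average. A rough scalar sketch of that average, assuming the usual DIST_PRECISION_BITS of 4 (so fwd_offset + bck_offset == 16); the function name is ours, for illustration only:

#include <stdint.h>

// Scalar model of dist_wtd_avg_u16x8: weighted average of the two predictors,
// rounded to nearest after removing the 4-bit weight precision.
static inline uint16_t scalar_dist_wtd_avg(uint16_t pred, uint16_t second_pred,
                                           uint16_t fwd_offset,
                                           uint16_t bck_offset) {
  return (uint16_t)((pred * fwd_offset + second_pred * bck_offset + 8) >> 4);
}

The bilinear variants below apply the filter blend first and feed the result straight into this average, avoiding a second pass over the block.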
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+ vst1_u16(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c
index 948f2f7..e54fc18 100644
--- a/aom_dsp/arm/highbd_variance_neon.c
+++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -15,10 +15,10 @@
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
// Process a block of width 4 two rows at a time.
static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
@@ -412,67 +412,6 @@
return *sse;
}
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
- int src_stride,
- const uint16_t *ref_ptr,
- int ref_stride, int h,
- unsigned int *sse) {
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h / 2;
- do {
- uint16x8_t s0 = vld1q_u16(src_ptr);
- src_ptr += src_stride;
- uint16x8_t s1 = vld1q_u16(src_ptr);
- src_ptr += src_stride;
- uint16x8_t r0 = vld1q_u16(ref_ptr);
- ref_ptr += ref_stride;
- uint16x8_t r1 = vld1q_u16(ref_ptr);
- ref_ptr += ref_stride;
-
- uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
- uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
- uint8x16_t diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, diff, diff);
- } while (--i != 0);
-
- *sse = horizontal_add_u32x4(sse_u32);
- return *sse;
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
- int src_stride,
- const uint16_t *ref_ptr,
- int ref_stride, int h,
- unsigned int *sse) {
- uint32x4_t sse_u32 = vdupq_n_u32(0);
-
- int i = h;
- do {
- uint16x8_t s0 = vld1q_u16(src_ptr);
- uint16x8_t s1 = vld1q_u16(src_ptr + 8);
- uint16x8_t r0 = vld1q_u16(ref_ptr);
- uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
-
- uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
- uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
- uint8x16_t diff = vabdq_u8(s, r);
- sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- *sse = horizontal_add_u32x4(sse_u32);
- return *sse;
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
int src_stride,
const uint16_t *ref_ptr,
@@ -491,8 +430,6 @@
sse);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
#define HIGHBD_MSE_WXH_NEON(w, h) \
uint32_t aom_highbd_8_mse##w##x##h##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
@@ -529,3 +466,55 @@
HIGHBD_MSE_WXH_NEON(8, 8)
#undef HIGHBD_MSE_WXH_NEON
+
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+ uint16x8_t s1, uint16x8_t d0,
+ uint16x8_t d1) {
+ uint16x8_t e0 = vabdq_u16(s0, d0);
+ uint16x8_t e1 = vabdq_u16(s1, d1);
+
+ uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+ mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+ mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+ mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+ return vpadalq_u32(sum, mse);
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return horizontal_add_u64x2(sum);
+}
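Despite the "mse" in its name, aom_mse_wxh_16bit_highbd_neon returns an unnormalised sum of squared differences; callers divide by the block size themselves. A scalar reference for what the kernel computes (for illustration, not part of the patch):

#include <stdint.h>

// Scalar reference: accumulate (dst - src)^2 over a w x h block of 16-bit
// samples. No normalisation is applied here.
static uint64_t scalar_mse_wxh_16bit(const uint16_t *dst, int dstride,
                                     const uint16_t *src, int sstride, int w,
                                     int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int64_t diff = (int64_t)dst[j] - src[j];
      sum += (uint64_t)(diff * diff);
    }
    dst += dstride;
    src += sstride;
  }
  return sum;
}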
diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000..d56ae97
--- /dev/null
+++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h / 2;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t s1 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+ uint16x8_t r1 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+ sse); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
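The new *_neon_dotprod kernels rely on the aom_highbd_8_ contract that the 16-bit buffers hold 8-bit content, so the vmovn_u16 narrowing is lossless and the squared errors can be accumulated with UDOT. A scalar model of one vdotq_u32(acc, diff, diff) step (illustration only):

#include <stdint.h>

// Each 32-bit accumulator lane gains the dot product of four 8-bit absolute
// differences with themselves, i.e. four squared errors per lane per step.
static void scalar_udot_squares(uint32_t acc[4], const uint8_t diff[16]) {
  for (int lane = 0; lane < 4; lane++) {
    for (int k = 0; k < 4; k++) {
      const uint32_t d = diff[4 * lane + k];
      acc[lane] += d * d;
    }
  }
}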
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 2161378..41f070e 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,7 +24,7 @@
// DC 4x4
static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
- const uint8x8_t a = load_u8_4x1_lane0(in);
+ const uint8x8_t a = load_u8_4x1(in);
const uint16x4_t p0 = vpaddl_u8(a);
const uint16x4_t p1 = vpadd_u16(p0, p0);
return vcombine_u16(p1, vdup_n_u16(0));
@@ -354,7 +354,7 @@
void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x8_t a = load_u8_4x1(above);
uint8x8_t l = vld1_u8(left);
uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
@@ -364,7 +364,7 @@
void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x8_t a = vld1_u8(above);
- uint8x8_t l = load_u8_4x1_lane0(left);
+ uint8x8_t l = load_u8_4x1(left);
uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
@@ -372,7 +372,7 @@
void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x8_t a = load_u8_4x1(above);
uint8x16_t l = vld1q_u8(left);
uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
uint32_t sum = horizontal_add_u16x8(sum_al);
@@ -383,7 +383,7 @@
void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x16_t a = vld1q_u8(above);
- uint8x8_t l = load_u8_4x1_lane0(left);
+ uint8x8_t l = load_u8_4x1(left);
uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
uint32_t sum = horizontal_add_u16x8(sum_al);
uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
@@ -620,7 +620,7 @@
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -646,13 +646,13 @@
void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}
void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
- v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+ v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}
void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -856,7 +856,7 @@
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
@@ -907,7 +907,7 @@
void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
@@ -936,7 +936,7 @@
void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t d0 = load_u8_4x1_lane0(left);
+ const uint8x8_t d0 = load_u8_4x1(left);
(void)above;
vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
@@ -1594,8 +1594,10 @@
base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
#if AOM_ARCH_AARCH64
- uint8x8_t left_idx0 = vreinterpret_u8_s16(base_y_c64 + 2); // [0, 16]
- uint8x8_t left_idx1 = vreinterpret_u8_s16(base_y_c64 + 3); // [1, 17]
+ uint8x8_t left_idx0 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
+ uint8x8_t left_idx1 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
@@ -1777,8 +1779,10 @@
base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
#if AOM_ARCH_AARCH64
- uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c128 + 2); // [0, 33]
- uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c128 + 3); // [1, 34]
+ uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
+ uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
@@ -2025,8 +2029,10 @@
#if AOM_ARCH_AARCH64
// Values in left_idx{0,1} range from 0 through 63 inclusive.
- uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c256.val[0] + 1);
- uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c256.val[1] + 1);
+ uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1)));
+ uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+ vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1)));
uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
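The three intrapred hunks above replace direct arithmetic on NEON vector types (base_y_c64 + 2 and friends) with explicit vdup/vadd intrinsics. The patch does not state why, but the operator form depends on GCC/Clang vector-extension support, while the intrinsic form is the portable spelling, e.g.:

#include <arm_neon.h>

// Portable equivalent of "v + 2" on an int16x4_t: broadcast the scalar and
// add with the intrinsic rather than relying on vector-extension operators.
static inline int16x4_t add_scalar_s16x4(int16x4_t v, int16_t k) {
  return vadd_s16(v, vdup_n_s16(k));
}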
@@ -3168,12 +3174,10 @@
const uint8_t bottom_left = left_column[height - 1];
const uint8_t *const weights_y = smooth_weights + height - 4;
- uint8x8_t UNINITIALIZED_IS_SAFE(top_v);
- load_u8_4x1(top_row, &top_v, 0);
+ uint8x8_t top_v = load_u8_4x1(top_row);
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
- uint8x8_t UNINITIALIZED_IS_SAFE(weights_x_v);
- load_u8_4x1(smooth_weights, &weights_x_v, 0);
+ uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
@@ -3403,9 +3407,9 @@
const uint8_t bottom_left = left_column[height - 1]; \
const uint8_t *const weights_y = smooth_weights + height - 4; \
\
- uint8x8_t UNINITIALIZED_IS_SAFE(top_v); \
+ uint8x8_t top_v; \
if ((W) == 4) { \
- load_u8_4x1(top_row, &top_v, 0); \
+ top_v = load_u8_4x1(top_row); \
} else { /* width == 8 */ \
top_v = vld1_u8(top_row); \
} \
@@ -3717,9 +3721,9 @@
int width, int height) {
const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
- uint8x8_t UNINITIALIZED_IS_SAFE(top);
+ uint8x8_t top;
if (width == 4) {
- load_u8_4x1(top_row, &top, 0);
+ top = load_u8_4x1(top_row);
} else { // width == 8
top = vld1_u8(top_row);
}
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 8fc7ccb..0e683a7 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -634,13 +634,13 @@
p6p2 = vget_low_u8(row1);
p5p1 = vget_low_u8(row2);
p4p0 = vget_low_u8(row3);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
q0q4 = vget_high_u8(row0);
q1q5 = vget_high_u8(row1);
q2q6 = vget_high_u8(row2);
q3qy = vget_high_u8(row3);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
@@ -679,13 +679,13 @@
q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
row0 = vcombine_u8(pxp3, q0q4);
row1 = vcombine_u8(p6p2, q1q5);
@@ -725,7 +725,7 @@
// row3: p3 p2 p1 p0 | q0 q1 q2 q3
load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
@@ -750,7 +750,7 @@
p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
}
@@ -784,7 +784,7 @@
// row3: px p2 p1 p0 | q0 q1 q2 qy
load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
@@ -809,7 +809,7 @@
p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
}
@@ -834,7 +834,7 @@
const uint8_t *limit, const uint8_t *thresh) {
uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
uint32x2_t pq_rev;
- uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1);
+ uint8x8_t p1p0, q0q1;
uint8x8_t p0q0, p1q1;
// row0: p1 p0 | q0 q1
@@ -843,7 +843,7 @@
// row3: p1 p0 | q0 q1
load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
- transpose_u8_4x4(&p1p0, &q0q1);
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
@@ -860,7 +860,7 @@
p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
- transpose_u8_4x4(&p1p0, &q0q1);
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
store_unaligned_u8_4x1(src - 2, p1p0, 0);
store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
@@ -886,25 +886,13 @@
void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1),
- UNINITIALIZED_IS_SAFE(p2q2), UNINITIALIZED_IS_SAFE(p3q3),
- UNINITIALIZED_IS_SAFE(p4q4), UNINITIALIZED_IS_SAFE(p5q5),
- UNINITIALIZED_IS_SAFE(p6q6);
-
- load_u8_4x1(src - 7 * stride, &p6q6, 0);
- load_u8_4x1(src - 6 * stride, &p5q5, 0);
- load_u8_4x1(src - 5 * stride, &p4q4, 0);
- load_u8_4x1(src - 4 * stride, &p3q3, 0);
- load_u8_4x1(src - 3 * stride, &p2q2, 0);
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
- load_u8_4x1(src + 2 * stride, &p2q2, 1);
- load_u8_4x1(src + 3 * stride, &p3q3, 1);
- load_u8_4x1(src + 4 * stride, &p4q4, 1);
- load_u8_4x1(src + 5 * stride, &p5q5, 1);
- load_u8_4x1(src + 6 * stride, &p6q6, 1);
+ uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+ uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+ uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+ uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+ uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
*thresh);
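For reference, the second argument of the new load_u8_4x2() calls above is the distance from a p-row to its matching q-row: load_u8_4x2(src - k * stride, (2 * k - 1) * stride) packs row p(k-1) into lanes 0-3 and row q(k-1) into lanes 4-7. A small sketch of that pairing (the wrapper name is ours, for illustration):

#include <arm_neon.h>
#include <stdint.h>

#include "aom_dsp/arm/mem_neon.h"  // for load_u8_4x2()

// Pair row p(k-1), at src - k * stride, with row q(k-1), at
// src + (k - 1) * stride, in a single uint8x8_t.
static inline uint8x8_t load_pq_pair(const uint8_t *src, int stride, int k) {
  return load_u8_4x2(src - k * stride, (2 * k - 1) * stride);
}

load_pq_pair(src, stride, 7) reproduces the p6q6 load above, and k == 1 gives p0q0.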
@@ -1036,12 +1024,8 @@
void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1);
-
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index 98daeda..8f65b80 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -516,19 +516,18 @@
vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
-#define MASKED_SAD4D_WXH_NEON(w, h) \
- void aom_masked_sad##w##x##h##x4d_neon( \
- const uint8_t *src, int src_stride, const uint8_t *ref[4], \
- int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
- int msk_stride, int invert_mask, uint32_t res[4]) { \
- if (invert_mask) { \
- return masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
- second_pred, msk, msk_stride, res, \
- h); \
- } else { \
- return masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
- second_pred, msk, msk_stride, res, h); \
- } \
+#define MASKED_SAD4D_WXH_NEON(w, h) \
+ void aom_masked_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, uint32_t res[4]) { \
+ if (invert_mask) { \
+ masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
+ second_pred, msk, msk_stride, res, h); \
+ } else { \
+ masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, res, h); \
+ } \
}
MASKED_SAD4D_WXH_NEON(4, 8)
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 340df05..9d26310 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -15,9 +15,10 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/blend.h"
-#include "mem_neon.h"
-#include "sum_neon.h"
static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
const uint8_t *src,
@@ -29,15 +30,7 @@
uint8x16_t b0 = vld1q_u8(b);
uint8x16_t s0 = vld1q_u8(src);
- uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
- uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
- blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
- blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
-
- uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
- uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
- uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
}
@@ -164,10 +157,7 @@
uint8x8_t b0 = vld1_u8(b);
uint8x8_t s0 = vld1_u8(src);
- uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16 = vmull_u8(m0, a0);
- blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
@@ -199,10 +189,7 @@
uint8x8_t b0 = load_unaligned_u8(b, b_stride);
uint8x8_t s0 = load_unaligned_u8(src, src_stride);
- uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
- uint16x8_t blend_u16 = vmull_u8(m0, a0);
- blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
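The masked SAD changes above swap the hand-written blend for the shared alpha_blend_a64_u8x8/u8x16 helpers. As a rough scalar sketch of that blend, assuming AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6 as elsewhere in libaom:

#include <stdint.h>

// Scalar model of the A64 alpha blend: a 6-bit weighted average of the two
// predictors, rounded to nearest, with mask values in [0, 64].
static inline uint8_t scalar_blend_a64(uint8_t m, uint8_t a, uint8_t b) {
  return (uint8_t)((m * a + (64 - m) * b + 32) >> 6);
}

The masked SAD is then the sum of |blend - src| over the block, which is exactly what the vabd/vpadal sequence accumulates.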
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 16d44c5..d1ac648 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -43,6 +43,11 @@
return res;
}
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+ uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+ return res;
+}
+
static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
@@ -85,18 +90,31 @@
return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define load_u8_4x1(s, s0, lane) \
- do { \
- *(s0) = vreinterpret_u8_u32( \
- vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
- } while (0)
-
// Load four bytes into the low half of a uint8x8_t, zero the upper half.
-static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
uint8x8_t ret = vdup_n_u8(0);
- load_u8_4x1(p, &ret, 0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ return ret;
+}
+
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ p += stride;
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+ return ret;
+}
+
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+ uint16x4_t ret = vdup_n_u16(0);
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+ p += stride;
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
return ret;
}
@@ -214,6 +232,38 @@
s += p;
}
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10, int16x4_t *const s11) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+ s += p;
+ *s11 = vld1_s16(s);
+}
+
static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
int16x4_t *const s0, int16x4_t *const s1,
int16x4_t *const s2, int16x4_t *const s3,
@@ -316,6 +366,23 @@
*s6 = vld1_s16(s);
}
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+}
+
static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
int16x4_t *const s0, int16x4_t *const s1,
int16x4_t *const s2, int16x4_t *const s3,
@@ -592,6 +659,33 @@
*s10 = vld1_u8(s);
}
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+}
+
static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -622,6 +716,38 @@
*s10 = vld1q_s16(s);
}
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10, int16x8_t *const s11) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+ s += p;
+ *s11 = vld1q_s16(s);
+}
+
static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2, uint16x8_t *const s3,
@@ -714,6 +840,23 @@
*s6 = vld1q_s16(s);
}
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+}
+
static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -793,6 +936,24 @@
return vreinterpret_u8_u32(a_u32);
}
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(a);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+ uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
uint32_t a;
uint32x2_t a_u32;
@@ -844,6 +1005,20 @@
memcpy(dst, &a, 2); \
} while (0)
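+// Store two uint16_t values (the selected 32-bit lane of 'src') to a possibly
+// unaligned address.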
+#define store_unaligned_u16_2x1(dst, src, lane) \
+ do { \
+ uint32_t a; \
+ a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
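+// Store four uint16_t values (the selected 64-bit lane of 'src') to a possibly
+// unaligned address.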
+#define store_unaligned_u16_4x1(dst, src, lane) \
+ do { \
+ uint64_t a; \
+ a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
+
static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
uint8x16_t *const s0, uint8x16_t *const s1,
uint8x16_t *const s2, uint8x16_t *const s3,
@@ -917,6 +1092,27 @@
*s7 = vld1q_u16(s + 8);
}
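+// Gather two rows of two uint16_t values, 'stride' elements apart, into a
+// single uint16x4_t.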
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+ int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u16_u32(a_u32);
+}
+
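+// Load four uint16_t values from a possibly unaligned address.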
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
uint32_t stride) {
uint64_t a;
@@ -1004,4 +1200,32 @@
vst1q_s32(buf, v0);
}
+static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_unaligned_u8_2x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u8_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_unaligned_u8_4x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u8_4x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x4_t src) {
+ store_unaligned_u16_2x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u16_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x8_t src) {
+ store_unaligned_u16_4x1(dst, src, 0);
+ dst += dst_stride;
+ store_unaligned_u16_4x1(dst, src, 1);
+}
+
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 60efef8..46a1666 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -15,93 +15,10 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int w, int h) {
- // Only two accumulators are required for optimal instruction throughput of
- // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr + j);
- r0 = vld1q_u8(ref_ptr + j);
- diff0 = vabdq_u8(s0, r0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- s1 = vld1q_u8(src_ptr + j + 16);
- r1 = vld1q_u8(ref_ptr + j + 16);
- diff1 = vabdq_u8(s1, r1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- j += 32;
- } while (j < w);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
-}
-
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
-}
-
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
-}
-
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int h) {
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h / 2;
- do {
- uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr);
- r0 = vld1q_u8(ref_ptr);
- diff0 = vabdq_u8(s0, r0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
-
- s1 = vld1q_u8(src_ptr);
- r1 = vld1q_u8(ref_ptr);
- diff1 = vabdq_u8(s1, r1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
@@ -220,28 +137,25 @@
static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
- uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
int i = h;
do {
uint8x16_t s0 = vld1q_u8(src_ptr);
uint8x16_t r0 = vld1q_u8(ref_ptr);
uint8x16_t diff0 = vabdq_u8(s0, r0);
- uint16x8_t sum0 = vpaddlq_u8(diff0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
uint8x16_t s1 = vld1q_u8(src_ptr + 16);
uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
uint8x16_t diff1 = vabdq_u8(s1, r1);
- uint16x8_t sum1 = vpaddlq_u8(diff1);
-
- sum = vpadalq_u16(sum, sum0);
- sum = vpadalq_u16(sum, sum1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
src_ptr += src_stride;
ref_ptr += ref_stride;
} while (--i != 0);
- return horizontal_add_u32x4(sum);
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -264,8 +178,6 @@
return horizontal_add_u16x8(sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
int h) {
@@ -384,114 +296,6 @@
#undef SAD_SKIP_WXH_NEON
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int w, int h,
- const uint8_t *second_pred) {
- // Only two accumulators are required for optimal instruction throughput of
- // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr + j);
- r0 = vld1q_u8(ref_ptr + j);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- s1 = vld1q_u8(src_ptr + j + 16);
- r1 = vld1q_u8(ref_ptr + j + 16);
- p1 = vld1q_u8(second_pred + 16);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- j += 32;
- second_pred += 32;
- } while (j < w);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
- second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
- second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
- second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h / 2;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr);
- r0 = vld1q_u8(ref_ptr);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
-
- s1 = vld1q_u8(src_ptr);
- r1 = vld1q_u8(ref_ptr);
- p1 = vld1q_u8(second_pred);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -644,7 +448,7 @@
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
- uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
int i = h;
do {
@@ -653,24 +457,21 @@
uint8x16_t p0 = vld1q_u8(second_pred);
uint8x16_t avg0 = vrhaddq_u8(r0, p0);
uint8x16_t diff0 = vabdq_u8(s0, avg0);
- uint16x8_t sum0 = vpaddlq_u8(diff0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
uint8x16_t s1 = vld1q_u8(src_ptr + 16);
uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
uint8x16_t p1 = vld1q_u8(second_pred + 16);
uint8x16_t avg1 = vrhaddq_u8(r1, p1);
uint8x16_t diff1 = vabdq_u8(s1, avg1);
- uint16x8_t sum1 = vpaddlq_u8(diff1);
-
- sum = vpadalq_u16(sum, sum0);
- sum = vpadalq_u16(sum, sum1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
src_ptr += src_stride;
ref_ptr += ref_stride;
second_pred += 32;
} while (--i != 0);
- return horizontal_add_u32x4(sum);
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
@@ -698,8 +499,6 @@
return horizontal_add_u16x8(sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -788,3 +587,287 @@
#endif // !CONFIG_REALTIME_ONLY
#undef SAD_WXH_AVG_NEON
+
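+// The dist_wtd_* SAD functions below compute the SAD between the source block
+// and a weighted average of the reference and the second prediction, using the
+// fwd_offset/bck_offset weights carried in jcp_param.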
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to prevent overflow for large values of 'h', as well
+  // as to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes.
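+  // Each UADALP step adds at most 2 * 255 = 510 to a uint16_t lane per row, so
+  // a single uint16x8_t accumulator is safe for up to 65535 / 510 = 128 rows.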
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+ uint8x16_t diff = vabdq_u8(s, wtd_avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000..5504c68
--- /dev/null
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
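+// Dot-product (UDOT) specializations of the Neon SAD functions. These mirror
+// the implementations in sad_neon.c, but accumulate absolute differences with
+// vdotq_u32() rather than widening pairwise adds.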
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
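+  // Taking the dot product of the absolute differences with a vector of ones
+  // acts as a widening horizontal add: each uint32_t lane accumulates the sum
+  // of four byte differences, so overflow is not a concern.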
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to minimize accumulation and loop-carried
+  // dependencies for better instruction throughput.
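+  // The 32-bit dot-product accumulators cannot overflow for any supported
+  // block size, so the 8-way split is purely for instruction throughput.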
+ uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[4] = vaddq_u32(sum[4], sum[5]);
+ sum[6] = vaddq_u32(sum[6], sum[7]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ sum[4] = vaddq_u32(sum[4], sum[6]);
+ sum[0] = vaddq_u32(sum[0], sum[4]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ uint8x16_t s1 = vld1q_u8(src_ptr);
+ uint8x16_t r1 = vld1q_u8(ref_ptr);
+ uint8x16_t p1 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \