Merge tag 'v3.7.1' into HEAD

libaom 3.7.1

2023-11-17 v3.7.1
  This release includes several bug fixes and is ABI compatible with
  the last release. See
  https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the
  commits in this release.

  - Bug Fixes
    * aomedia:3349: Heap overflow when increasing resolution
    * aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on
      aom/av1/encoder/motion_search_facade.c
    * aomedia:3489: Detect encoder and image high bit depth mismatch
    * aomedia:3491: Heap-buffer-overflow on frame size change
    * b/303023614: Segfault at encoding time for high bit depth images

Bug: aomedia:3513
Change-Id: Iecf1f155b4f0ea2604ef27fef0d6111499ea9bad
diff --git a/.mailmap b/.mailmap
index 7d31a70..6d6e6302 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,12 +1,16 @@
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
 Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
-Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Alexander Bokov <alexanderbokov@google.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Andrey Norkin <anorkin@netflix.com>
 Angie Chiang <angiebird@google.com>
 Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
 Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Aasaipriya Chandran <aasaipriya.c@ittiam.com>
 Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
 Apurve Pandey <apurve.pandey@ittiam.com>
@@ -27,9 +31,10 @@
 Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
 Guillaume Martres <smarter@ubuntu.com>
 Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
-Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
 Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
 Hui Su <huisu@google.com>
 Iole Moccagatta <iole.moccagatta@gmail.com>
 Jacky Chen <jackychen@google.com>
@@ -40,13 +45,14 @@
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Kyle Siefring <siekyleb@amazon.com>
 Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
 Lin Zheng <linzhen@google.com>
-Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Luc Trudeau <luc@trud.ca>
 Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
 Marco Paniconi <marpan@google.com>
@@ -56,6 +62,7 @@
 Mingliang Chen <mlchen@google.com>
 Monty Montgomery <cmontgomery@mozilla.com>
 Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
 Mudassir Galaganath <mudassir.galaganath@ittiam.com> Mudassir Galagnath
 Nathan E. Egge <negge@mozilla.com>
 Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
@@ -72,13 +79,14 @@
 Remya Prakasan <remya.prakasan@ittiam.com>
 Roger Zhou <youzhou@microsoft.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
-Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
 Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
 Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
 Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
 Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
 Sarah Parker <sarahparker@google.com>
+Susanna D'Souza <susannad@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
@@ -90,14 +98,16 @@
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
 Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com>
 Wei-Ting Lin <weitinglin@google.com>
 Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
 Wenyao Liu <wenyao.liu@cidana.com>
 Will Bresnahan <bill.wresnahan@gmail.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
 Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
-Yaowu Xu <yaowu@google.com> <Yaowu Xu>
-Yaowu Xu <yaowu@google.com> <yaowu.google.com>
 Zhipin Deng <zhipin.deng@intel.com>
 Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e0b65f..8e6ca6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -288,9 +288,9 @@
   add_library(aom_static STATIC ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
   set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
   if(MSVC OR (WIN32 AND NOT MINGW))
-    # Fix race condition on the export library file between the two versions.
-    # Affects MSVC in all three flavors (stock, Clang/CL, LLVM-- the latter sets
-    # MSVC and MINGW both to FALSE).
+    # Fix race condition between the import library and the static library.
+    # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter
+    # sets MSVC and MINGW both to FALSE).
     set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll")
   endif()
 
@@ -323,7 +323,7 @@
   endif()
 endif()
 
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
   list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
               "${AOM_ROOT}/av1/ratectrl_rtc.cc")
   add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
@@ -336,7 +336,7 @@
 
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc)
 endif()
 if(BUILD_SHARED_LIBS)
@@ -387,13 +387,6 @@
   endif()
 endif()
 
-if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
-  add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
-                             $<TARGET_OBJECTS:aom_common_app_util>)
-  set_property(TARGET ${example} PROPERTY FOLDER examples)
-  list(APPEND AOM_APP_TARGETS resize_util)
-endif()
-
 if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
   add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
                         $<TARGET_OBJECTS:aom_common_app_util>
@@ -494,14 +487,18 @@
                                     $<TARGET_OBJECTS:aom_common_app_util>
                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
 
-    add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
-                                   $<TARGET_OBJECTS:aom_common_app_util>
-                                   $<TARGET_OBJECTS:aom_encoder_app_util>)
-
     # Maintain a list of encoder example targets.
     list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
                 photon_noise_table set_maps simple_encoder scalable_encoder
-                twopass_encoder svc_encoder_rtc)
+                twopass_encoder)
+
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
+                                     $<TARGET_OBJECTS:aom_common_app_util>
+                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
+      target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc)
+      list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc)
+    endif()
   endif()
 
   if(ENABLE_TOOLS)
@@ -852,7 +849,7 @@
 # Aomedia documentation rule.
 set(DOXYGEN_VERSION_VALUE 0)
 if(ENABLE_DOCS)
-  include(FindDoxygen)
+  find_package(Doxygen)
   if(DOXYGEN_FOUND)
     # Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
     set(MINIMUM_DOXYGEN_VERSION 1008010)
@@ -942,7 +939,8 @@
 get_cmake_property(all_cmake_vars VARIABLES)
 foreach(var ${all_cmake_vars})
   if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
-     AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST")
+     AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST"
+     AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_")
     list(APPEND aom_source_vars ${var})
   endif()
 endforeach()
diff --git a/README.md b/README.md
index d7b66e0..4e2eb27 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,7 @@
 The toolchain files available at the time of this writing are:
 
  - arm64-ios.cmake
+ - arm64-linux-clang.cmake
  - arm64-linux-gcc.cmake
  - arm64-mingw-gcc.cmake
  - armv7-ios.cmake
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index e3d8d29..5d0bbe1 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -1006,11 +1006,11 @@
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 
 /*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
-#define AOM_USAGE_GOOD_QUALITY (0)
+#define AOM_USAGE_GOOD_QUALITY 0u
 /*!\brief usage parameter analogous to AV1 REALTIME mode. */
-#define AOM_USAGE_REALTIME (1)
+#define AOM_USAGE_REALTIME 1u
 /*!\brief usage parameter analogous to AV1 all intra mode. */
-#define AOM_USAGE_ALL_INTRA (2)
+#define AOM_USAGE_ALL_INTRA 2u
 
 /*!\brief Encode a frame
  *
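Not part of the patch: the usage macros above become unsigned literals, presumably to match the "unsigned int usage" parameter of aom_codec_enc_config_default(). A minimal sketch of how a usage value flows into encoder setup (the 640x360 frame size is a placeholder, not taken from this change):

#include <aom/aom_encoder.h>
#include <aom/aomcx.h>

/* Sketch only: fetch the default realtime configuration and initialize an
 * encoder with it. */
static int init_realtime_encoder(aom_codec_ctx_t *codec,
                                 aom_codec_enc_cfg_t *cfg) {
  aom_codec_iface_t *iface = aom_codec_av1_cx();
  if (aom_codec_enc_config_default(iface, cfg, AOM_USAGE_REALTIME) !=
      AOM_CODEC_OK) {
    return -1;
  }
  cfg->g_w = 640; /* placeholder dimensions */
  cfg->g_h = 360;
  return aom_codec_enc_init(codec, iface, cfg, 0) == AOM_CODEC_OK ? 0 : -1;
}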
diff --git a/aom/aomcx.h b/aom/aomcx.h
index a5db0a5..f061be3 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -208,14 +208,14 @@
    * encoding process, values greater than 0 will increase encoder speed at
    * the expense of quality.
    *
-   * Valid range: 0..10. 0 runs the slowest, and 10 runs the fastest;
+   * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest;
    * quality improves as speed decreases (since more compression
    * possibilities are explored).
    *
-   * NOTE: 10 is only allowed in AOM_USAGE_REALTIME. In AOM_USAGE_GOOD_QUALITY
-   * and AOM_USAGE_ALL_INTRA, 9 is the highest allowed value. However,
-   * AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, AOM_USAGE_REALTIME
-   * treats 0..4 the same as 5.
+   * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In
+   * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed
+   * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also,
+   * AOM_USAGE_REALTIME treats 0..4 the same as 5.
    */
   AOME_SET_CPUUSED = 13,
 
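A minimal usage sketch for the widened range (not part of the patch), assuming the encoder was created with AOM_USAGE_REALTIME so that speeds 10 and 11 are legal:

#include <aom/aomcx.h>

/* Sketch only: select encoder speed. Valid values are 0..11 for realtime and
 * 0..9 for good-quality and all-intra, per the comment above. */
static aom_codec_err_t set_speed(aom_codec_ctx_t *codec, int speed) {
  return aom_codec_control(codec, AOME_SET_CPUUSED, speed);
}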
@@ -1527,6 +1527,12 @@
    */
   AV1E_SET_BITRATE_ONE_PASS_CBR = 163,
 
+  /*!\brief Codec control to set the maximum number of consecutive frame drops
+   * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of
+   * zero has no effect.
+   */
+  AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164,
+
   // Any new encoder control IDs should be added above.
   // Maximum allowed encoder control ID is 229.
   // No encoder control ID should be added below.
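An illustrative call for the new control (not from the patch); the cap of 2 is an arbitrary example, and 0 leaves the frame dropper unconstrained as documented above:

#include <aom/aomcx.h>

/* Sketch only: allow at most 2 consecutive frame drops in 1-pass CBR mode. */
static aom_codec_err_t cap_frame_drops(aom_codec_ctx_t *codec) {
  return aom_codec_control(codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 2);
}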
@@ -1678,10 +1684,10 @@
 
 /*!brief Parameters for setting ref frame config */
 typedef struct aom_svc_ref_frame_config {
-  // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
-  // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  // 7 references: The index 0 - 6 refers to the references:
+  // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
   int reference[7]; /**< Reference flag for each of the 7 references. */
-  /*! Buffer slot index for each of 7 references. */
+  /*! Buffer slot index for each of 7 references indexed above. */
   int ref_idx[7];
   int refresh[8]; /**< Refresh flag for each of the 8 slots. */
 } aom_svc_ref_frame_config_t;
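A hedged sketch of how this struct is typically filled and passed via AV1E_SET_SVC_REF_FRAME_CONFIG; the single-reference layout below is an illustrative choice, not something this change prescribes:

#include <string.h>

#include <aom/aomcx.h>

/* Sketch only: predict from LAST (index 0) alone, read it from buffer slot 0,
 * and refresh that slot with the coded frame. */
static aom_codec_err_t use_last_frame_only(aom_codec_ctx_t *codec) {
  aom_svc_ref_frame_config_t ref_cfg;
  memset(&ref_cfg, 0, sizeof(ref_cfg));
  ref_cfg.reference[0] = 1; /* reference flag for LAST */
  ref_cfg.ref_idx[0] = 0;   /* LAST maps to buffer slot 0 */
  ref_cfg.refresh[0] = 1;   /* refresh slot 0 after encoding */
  return aom_codec_control(codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);
}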
@@ -2172,6 +2178,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int)
 #define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR
 
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int)
+#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4c60e5c..f8f2cbb 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -115,12 +115,17 @@
             "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
 
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
 if(CONFIG_AV1_HIGHBITDEPTH)
   list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
               "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
@@ -134,6 +139,11 @@
               "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
 
   list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
 endif()
 
@@ -191,6 +201,9 @@
 
     list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
                 "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+                "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
   endif()
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
@@ -269,7 +282,15 @@
               "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+              "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
 
   if(CONFIG_AV1_HIGHBITDEPTH)
     list(APPEND AOM_DSP_ENCODER_ASM_SSE2
@@ -292,11 +313,20 @@
 
     list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
                 "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
-                "${AOM_ROOT}/aom_dsp/arm/highbd_sad4d_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+                "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
   endif()
 
   if(CONFIG_INTERNAL_STATS)
@@ -326,6 +356,10 @@
 
     list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
                      "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+                     "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+                     "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
   endif()
 endif()
 
@@ -433,6 +467,23 @@
     endif()
   endif()
 
+  if(HAVE_NEON_DOTPROD)
+    add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                  "aom_dsp_common"
+                                  "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                    "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+    endif()
+  endif()
+
+  if(HAVE_NEON_I8MM)
+    add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+                                  "aom_dsp_common"
+                                  "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+  endif()
+
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
   if(BUILD_SHARED_LIBS)
     target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index efb634a..85dc005 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -23,10 +23,6 @@
 
 #define PI 3.141592653589793238462643383279502884
 
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif  // ndef MAX_SB_SIZE
-
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
 #define AOMSIGN(x) ((x) < 0 ? -1 : 0)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e738971..c9b2682 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -497,22 +497,22 @@
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-specialize qw/aom_convolve_copy       neon sse2 avx2/;
-specialize qw/aom_convolve8_horiz     neon sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert      neon sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve_copy       neon                        sse2 avx2/;
+specialize qw/aom_convolve8_horiz     neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert      neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
 
 add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/aom_scaled_2d ssse3 neon/;
 
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
-  specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+  specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
 
   add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
-  specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
+  specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
 
   add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
-  specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+  specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
 }
 
 #
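For readers unfamiliar with the RTCD scheme these lines feed: add_proto declares the C prototype and specialize lists the SIMD variants the runtime dispatcher may bind to the plain symbol. A conceptual sketch, assuming the generated config/aom_dsp_rtcd.h is in scope (it is internal to the library build) and using the prototype declared above:

#include <stddef.h>
#include <stdint.h>

#include "config/aom_dsp_rtcd.h"

/* Sketch only: the plain symbol runs whichever variant RTCD bound at init,
 * e.g. aom_convolve8_horiz_neon_dotprod on CPUs with the Arm dot-product
 * extension. A step_q4 of 16 selects the unscaled path. */
static void run_horiz_filter(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, const int16_t *filter_y,
                             int w, int h) {
  aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, 16, filter_y,
                      16, w, h);
}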
@@ -750,7 +750,7 @@
 add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
 add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
 add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
+specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/;
 specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
 specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
 
@@ -759,10 +759,10 @@
   add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
   add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
   add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
-  specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+  specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/;
 }
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -773,35 +773,33 @@
   specialize qw/aom_subtract_block neon sse2 avx2/;
 
   add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
-  specialize qw/aom_sse  sse4_1 avx2 neon/;
+  specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/;
 
   add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
-  specialize qw/aom_get_blk_sse_sum sse2 avx2/;
+  specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
     specialize qw/aom_highbd_subtract_block sse2 neon/;
 
     add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
-    specialize qw/aom_highbd_sse  sse4_1 avx2 neon/;
+    specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
   }
 
-  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    #
-    # Sum of Squares
-    #
-    add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
-    specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
+  #
+  # Sum of Squares
+  #
+  add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+  specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
 
-    add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
-    specialize qw/aom_sum_squares_i16 sse2 neon/;
+  add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+  specialize qw/aom_sum_squares_i16 sse2 neon/;
 
-    add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
-    specialize qw/aom_var_2d_u8 sse2 avx2 neon/;
+  add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+  specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/;
 
-    add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
-    specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
-  }
+  add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+  specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
 
   #
   # Single block SAD / Single block Avg SAD
@@ -816,65 +814,65 @@
 
   add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
   specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/;
-  specialize qw/aom_sad128x128    avx2 neon sse2/;
-  specialize qw/aom_sad128x64     avx2 neon sse2/;
-  specialize qw/aom_sad64x128     avx2 neon sse2/;
-  specialize qw/aom_sad64x64      avx2 neon sse2/;
-  specialize qw/aom_sad64x32      avx2 neon sse2/;
-  specialize qw/aom_sad32x64      avx2 neon sse2/;
-  specialize qw/aom_sad32x32      avx2 neon sse2/;
-  specialize qw/aom_sad32x16      avx2 neon sse2/;
-  specialize qw/aom_sad16x32           neon sse2/;
-  specialize qw/aom_sad16x16           neon sse2/;
-  specialize qw/aom_sad16x8            neon sse2/;
-  specialize qw/aom_sad8x16            neon sse2/;
-  specialize qw/aom_sad8x8             neon sse2/;
-  specialize qw/aom_sad8x4             neon sse2/;
-  specialize qw/aom_sad4x8             neon sse2/;
-  specialize qw/aom_sad4x4             neon sse2/;
+  specialize qw/aom_sad128x128    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x16            sse2 neon/;
+  specialize qw/aom_sad8x8             sse2 neon/;
+  specialize qw/aom_sad8x4             sse2 neon/;
+  specialize qw/aom_sad4x8             sse2 neon/;
+  specialize qw/aom_sad4x4             sse2 neon/;
 
-  specialize qw/aom_sad4x16            neon sse2/;
-  specialize qw/aom_sad16x4            neon sse2/;
-  specialize qw/aom_sad8x32            neon sse2/;
-  specialize qw/aom_sad32x8            neon sse2/;
-  specialize qw/aom_sad16x64           neon sse2/;
-  specialize qw/aom_sad64x16           neon sse2/;
+  specialize qw/aom_sad4x16            sse2 neon/;
+  specialize qw/aom_sad16x4            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x32            sse2 neon/;
+  specialize qw/aom_sad32x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x16           sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x128     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x16      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_16x32                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x16                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x8                      sse2  neon/;
-  specialize qw/aom_sad_skip_8x4                            neon/;
-  specialize qw/aom_sad_skip_4x8                      sse2  neon/;
-  specialize qw/aom_sad_skip_4x4                            neon/;
+  specialize qw/aom_sad_skip_128x128    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_128x64     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x128     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x16      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x32           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x16           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_8x16            sse2 neon/;
+  specialize qw/aom_sad_skip_8x8             sse2 neon/;
+  specialize qw/aom_sad_skip_8x4                  neon/;
+  specialize qw/aom_sad_skip_4x8             sse2 neon/;
+  specialize qw/aom_sad_skip_4x4                  neon/;
 
-  specialize qw/aom_sad_skip_4x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_16x4                           neon/;
-  specialize qw/aom_sad_skip_8x32                     sse2  neon/;
-  specialize qw/aom_sad_skip_32x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_16x64                    sse2  neon/;
-  specialize qw/aom_sad_skip_64x16                    sse2  neon/;
+  specialize qw/aom_sad_skip_4x16            sse2 neon/;
+  specialize qw/aom_sad_skip_16x4                 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_8x32            sse2 neon/;
+  specialize qw/aom_sad_skip_32x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x64           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x16           sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
-  specialize qw/aom_sad128x64_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x128_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad64x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x16_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad16x32_avg        sse2 neon/;
-  specialize qw/aom_sad16x16_avg        sse2 neon/;
-  specialize qw/aom_sad16x8_avg         sse2 neon/;
+  specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x16_avg         sse2 neon/;
   specialize qw/aom_sad8x8_avg          sse2 neon/;
   specialize qw/aom_sad8x4_avg          sse2 neon/;
@@ -882,36 +880,36 @@
   specialize qw/aom_sad4x4_avg          sse2 neon/;
 
   specialize qw/aom_sad4x16_avg         sse2 neon/;
-  specialize qw/aom_sad16x4_avg         sse2 neon/;
+  specialize qw/aom_sad16x4_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x32_avg         sse2 neon/;
-  specialize qw/aom_sad32x8_avg         sse2 neon/;
-  specialize qw/aom_sad16x64_avg        sse2 neon/;
-  specialize qw/aom_sad64x16_avg        sse2 neon/;
+  specialize qw/aom_sad32x8_avg         sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x16_avg        sse2 neon neon_dotprod/;
 
-  specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
-  specialize qw/aom_dist_wtd_sad128x64_avg  sse2/;
-  specialize qw/aom_dist_wtd_sad64x128_avg  sse2/;
-  specialize qw/aom_dist_wtd_sad64x64_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad64x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x64_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x16_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x16_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x8_avg    sse2/;
-  specialize qw/aom_dist_wtd_sad8x16_avg    sse2/;
-  specialize qw/aom_dist_wtd_sad8x8_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad8x4_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad4x8_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad4x4_avg     sse2/;
+  specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad128x64_avg  sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x128_avg  sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x64_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x64_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x16_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x16_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x8_avg    sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad8x16_avg    sse2 neon/;
+  specialize qw/aom_dist_wtd_sad8x8_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad8x4_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad4x8_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad4x4_avg     sse2 neon/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    specialize qw/aom_dist_wtd_sad4x16_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad16x4_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad8x32_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad32x8_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad16x64_avg    sse2/;
-    specialize qw/aom_dist_wtd_sad64x16_avg    sse2/;
+    specialize qw/aom_dist_wtd_sad4x16_avg     sse2 neon/;
+    specialize qw/aom_dist_wtd_sad16x4_avg     sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad8x32_avg     sse2 neon/;
+    specialize qw/aom_dist_wtd_sad32x8_avg     sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad16x64_avg    sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad64x16_avg    sse2 neon neon_dotprod/;
   }
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -974,27 +972,29 @@
     specialize qw/aom_highbd_sad_skip_16x64   avx2 sse2 neon/;
     specialize qw/aom_highbd_sad_skip_64x16   avx2 sse2 neon/;
 
-    specialize qw/aom_highbd_sad128x128_avg avx2/;
-    specialize qw/aom_highbd_sad128x64_avg  avx2/;
-    specialize qw/aom_highbd_sad64x128_avg  avx2/;
-    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4_avg     sse2/;
-    specialize qw/aom_highbd_sad4x8_avg     sse2/;
-    specialize qw/aom_highbd_sad4x4_avg     sse2/;
+    specialize qw/aom_highbd_sad128x128_avg avx2      neon/;
+    specialize qw/aom_highbd_sad128x64_avg  avx2      neon/;
+    specialize qw/aom_highbd_sad64x128_avg  avx2      neon/;
+    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad8x16_avg              neon/;
+    specialize qw/aom_highbd_sad8x8_avg               neon/;
+    specialize qw/aom_highbd_sad8x4_avg          sse2 neon/;
+    specialize qw/aom_highbd_sad4x8_avg          sse2 neon/;
+    specialize qw/aom_highbd_sad4x4_avg          sse2 neon/;
 
-    specialize qw/aom_highbd_sad4x16_avg    sse2/;
-    specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x32_avg    sse2/;
-    specialize qw/aom_highbd_sad32x8_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad16x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad4x16_avg         sse2 neon/;
+    specialize qw/aom_highbd_sad8x32_avg         sse2 neon/;
+    specialize qw/aom_highbd_sad16x4_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x8_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad64x16_avg   avx2 sse2 neon/;
   }
   #
   # Masked SAD
@@ -1009,7 +1009,7 @@
     foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
+      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
     }
   }
 
@@ -1030,7 +1030,7 @@
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
         if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
         }
       }
     }
@@ -1047,47 +1047,47 @@
     add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
   }
 
-  specialize qw/aom_sad128x128x4d avx2 neon sse2/;
-  specialize qw/aom_sad128x64x4d  avx2 neon sse2/;
-  specialize qw/aom_sad64x128x4d  avx2 neon sse2/;
-  specialize qw/aom_sad64x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad64x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x8x4d    avx2 neon sse2/;
+  specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8x4d    avx2 sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad8x16x4d         neon sse2/;
-  specialize qw/aom_sad8x8x4d          neon sse2/;
-  specialize qw/aom_sad8x4x4d          neon sse2/;
-  specialize qw/aom_sad4x8x4d          neon sse2/;
-  specialize qw/aom_sad4x4x4d          neon sse2/;
+  specialize qw/aom_sad8x16x4d         sse2 neon/;
+  specialize qw/aom_sad8x8x4d          sse2 neon/;
+  specialize qw/aom_sad8x4x4d          sse2 neon/;
+  specialize qw/aom_sad4x8x4d          sse2 neon/;
+  specialize qw/aom_sad4x4x4d          sse2 neon/;
 
-  specialize qw/aom_sad64x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x8x4d    avx2 neon sse2/;
-  specialize qw/aom_sad16x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x4x4d    avx2 neon sse2/;
-  specialize qw/aom_sad8x32x4d         neon sse2/;
-  specialize qw/aom_sad4x16x4d         neon sse2/;
+  specialize qw/aom_sad64x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x8x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x4x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x32x4d         sse2 neon/;
+  specialize qw/aom_sad4x16x4d         sse2 neon/;
 
-  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad_skip_16x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x4x4d              neon/;
+  specialize qw/aom_sad_skip_16x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x4x4d              neon neon_dotprod/;
   specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
@@ -1096,29 +1096,29 @@
   specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
   specialize qw/aom_sad_skip_4x4x4d               neon/;
 
-  specialize qw/aom_sad128x128x3d neon avx2/;
-  specialize qw/aom_sad128x64x3d  neon avx2/;
-  specialize qw/aom_sad64x128x3d  neon avx2/;
-  specialize qw/aom_sad64x64x3d   neon avx2/;
-  specialize qw/aom_sad64x32x3d   neon avx2/;
-  specialize qw/aom_sad32x64x3d   neon avx2/;
-  specialize qw/aom_sad32x32x3d   neon avx2/;
-  specialize qw/aom_sad32x16x3d   neon avx2/;
-  specialize qw/aom_sad16x32x3d   neon avx2/;
-  specialize qw/aom_sad16x16x3d   neon avx2/;
-  specialize qw/aom_sad16x8x3d    neon avx2/;
-  specialize qw/aom_sad8x16x3d    neon/;
-  specialize qw/aom_sad8x8x3d     neon/;
-  specialize qw/aom_sad8x4x3d     neon/;
-  specialize qw/aom_sad4x8x3d     neon/;
-  specialize qw/aom_sad4x4x3d     neon/;
+  specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64x3d  avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128x3d  avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8x3d    avx2 neon neon_dotprod/;
+  specialize qw/aom_sad8x16x3d         neon/;
+  specialize qw/aom_sad8x8x3d          neon/;
+  specialize qw/aom_sad8x4x3d          neon/;
+  specialize qw/aom_sad4x8x3d          neon/;
+  specialize qw/aom_sad4x4x3d          neon/;
 
-  specialize qw/aom_sad64x16x3d   neon avx2/;
-  specialize qw/aom_sad32x8x3d    neon avx2/;
-  specialize qw/aom_sad16x64x3d   neon avx2/;
-  specialize qw/aom_sad16x4x3d    neon/;
-  specialize qw/aom_sad8x32x3d    neon/;
-  specialize qw/aom_sad4x16x3d    neon/;
+  specialize qw/aom_sad64x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x8x3d    avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x4x3d         neon neon_dotprod/;
+  specialize qw/aom_sad8x32x3d         neon/;
+  specialize qw/aom_sad4x16x3d         neon/;
 
   specialize qw/aom_masked_sad128x128x4d  ssse3 neon/;
   specialize qw/aom_masked_sad128x64x4d   ssse3 neon/;
@@ -1153,9 +1153,9 @@
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
       if ($w != 128 && $h != 128) {
         specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
       }
@@ -1208,22 +1208,29 @@
     specialize qw/aom_highbd_sad_skip_16x64x4d   avx2 sse2 neon/;
     specialize qw/aom_highbd_sad_skip_64x16x4d   avx2 sse2 neon/;
 
-    specialize qw/aom_highbd_sad128x128x3d avx2/;
-    specialize qw/aom_highbd_sad128x64x3d  avx2/;
-    specialize qw/aom_highbd_sad64x128x3d  avx2/;
-    specialize qw/aom_highbd_sad64x64x3d   avx2/;
-    specialize qw/aom_highbd_sad64x32x3d   avx2/;
-    specialize qw/aom_highbd_sad32x64x3d   avx2/;
-    specialize qw/aom_highbd_sad32x32x3d   avx2/;
-    specialize qw/aom_highbd_sad32x16x3d   avx2/;
-    specialize qw/aom_highbd_sad16x32x3d   avx2/;
-    specialize qw/aom_highbd_sad16x16x3d   avx2/;
-    specialize qw/aom_highbd_sad16x8x3d    avx2/;
+    specialize qw/aom_highbd_sad128x128x3d avx2 neon/;
+    specialize qw/aom_highbd_sad128x64x3d  avx2 neon/;
+    specialize qw/aom_highbd_sad64x128x3d  avx2 neon/;
+    specialize qw/aom_highbd_sad64x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad64x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x8x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad8x16x3d         neon/;
+    specialize qw/aom_highbd_sad8x8x3d          neon/;
+    specialize qw/aom_highbd_sad8x4x3d          neon/;
+    specialize qw/aom_highbd_sad4x8x3d          neon/;
+    specialize qw/aom_highbd_sad4x4x3d          neon/;
 
-    specialize qw/aom_highbd_sad16x4x3d    avx2/;
-    specialize qw/aom_highbd_sad32x8x3d    avx2/;
-    specialize qw/aom_highbd_sad16x64x3d   avx2/;
-    specialize qw/aom_highbd_sad64x16x3d   avx2/;
+    specialize qw/aom_highbd_sad64x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x8x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad16x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x4x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad8x32x3d         neon/;
+    specialize qw/aom_highbd_sad4x16x3d         neon/;
   }
   #
   # Avg
@@ -1323,20 +1330,20 @@
   # Specialty Variance
   #
   add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
-  specialize qw/aom_get_var_sse_sum_8x8_quad        avx2 sse2 neon/;
+  specialize qw/aom_get_var_sse_sum_8x8_quad        avx2 sse2 neon neon_dotprod/;
 
   add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
-  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2 sse2 neon/;
+  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-  specialize qw/aom_mse16x16          sse2 avx2 neon/;
-  specialize qw/aom_mse16x8           sse2      neon/;
-  specialize qw/aom_mse8x16           sse2      neon/;
-  specialize qw/aom_mse8x8            sse2      neon/;
+  specialize qw/aom_mse16x16          sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_mse16x8           sse2      neon neon_dotprod/;
+  specialize qw/aom_mse8x16           sse2      neon neon_dotprod/;
+  specialize qw/aom_mse8x8            sse2      neon neon_dotprod/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
@@ -1345,31 +1352,32 @@
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-      specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
-      specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+      specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/;
+      specialize "aom_highbd_${bd}_mse16x8", qw/neon/;
+      specialize "aom_highbd_${bd}_mse8x16", qw/neon/;
+      specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/;
     }
+
+    specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
   }
 
   #
   #
   #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
-  specialize qw/aom_get_mb_ss sse2/;
+  specialize qw/aom_get_mb_ss sse2 neon/;
 
   #
   # Variance / Subpixel Variance / Subpixel Avg Variance
   #
-  add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
   add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
   specialize qw/aom_mse_wxh_16bit  sse2 avx2 neon/;
 
   add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
-  specialize qw/aom_mse_16xh_16bit sse2 avx2/;
+  specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
 
   foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
@@ -1378,22 +1386,22 @@
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
-  specialize qw/aom_variance128x128   sse2 avx2 neon/;
-  specialize qw/aom_variance128x64    sse2 avx2 neon/;
-  specialize qw/aom_variance64x128    sse2 avx2 neon/;
-  specialize qw/aom_variance64x64     sse2 avx2 neon/;
-  specialize qw/aom_variance64x32     sse2 avx2 neon/;
-  specialize qw/aom_variance32x64     sse2 avx2 neon/;
-  specialize qw/aom_variance32x32     sse2 avx2 neon/;
-  specialize qw/aom_variance32x16     sse2 avx2 neon/;
-  specialize qw/aom_variance16x32     sse2 avx2 neon/;
-  specialize qw/aom_variance16x16     sse2 avx2 neon/;
-  specialize qw/aom_variance16x8      sse2 avx2 neon/;
-  specialize qw/aom_variance8x16      sse2      neon/;
-  specialize qw/aom_variance8x8       sse2      neon/;
-  specialize qw/aom_variance8x4       sse2      neon/;
-  specialize qw/aom_variance4x8       sse2      neon/;
-  specialize qw/aom_variance4x4       sse2      neon/;
+  specialize qw/aom_variance128x128   sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance128x64    sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x128    sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x64     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x64     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x16     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x16     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x8      sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance8x16      sse2      neon neon_dotprod/;
+  specialize qw/aom_variance8x8       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance8x4       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance4x8       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance4x4       sse2      neon neon_dotprod/;
 
   specialize qw/aom_sub_pixel_variance128x128   avx2 neon sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance128x64    avx2 neon sse2 ssse3/;
@@ -1430,12 +1438,12 @@
   specialize qw/aom_sub_pixel_avg_variance4x4          neon sse2 ssse3/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    specialize qw/aom_variance4x16  neon sse2/;
-    specialize qw/aom_variance16x4  neon sse2 avx2/;
-    specialize qw/aom_variance8x32  neon sse2/;
-    specialize qw/aom_variance32x8  neon sse2 avx2/;
-    specialize qw/aom_variance16x64 neon sse2 avx2/;
-    specialize qw/aom_variance64x16 neon sse2 avx2/;
+    specialize qw/aom_variance4x16  neon neon_dotprod sse2/;
+    specialize qw/aom_variance16x4  neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance8x32  neon neon_dotprod sse2/;
+    specialize qw/aom_variance32x8  neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
 
     specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
     specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
@@ -1450,82 +1458,259 @@
     specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
     specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
 
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
   }
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   neon ssse3/;
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   neon ssse3/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
       foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-        if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-          if ($bd == 10) {
-            specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse2 neon/;
-          } else {
-            specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-          }
-        }
-
-        if ($w == 4 || $h == 4) {
-          # TODO(rachelbarker): When ext-partition-types is enabled, we currently
-          # don't have vectorized 4x16 highbd variance functions
-          if ($w == 4 && $h == 4) {
-            if ($bd == 10) {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse4_1 neon/;
-            } else {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
-            }
-          } else {
-            if ($bd == 10) {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/neon/;
-            }
-          }
-        }
-
-
-        if ($w != 128 && $h != 128 && $w != 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-        }
-        if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-        }
-
         add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
       }
     }
+
+    specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_12_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x4          neon/;
+    specialize qw/aom_highbd_12_variance4x8          neon/;
+    specialize qw/aom_highbd_12_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance128x64  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x128  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x8    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x16    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x8     sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x4               neon/;
+    specialize qw/aom_highbd_10_variance4x8               neon/;
+    specialize qw/aom_highbd_10_variance4x4   sse4_1      neon/;
+
+    specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_8_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x4          neon/;
+    specialize qw/aom_highbd_8_variance4x8          neon/;
+    specialize qw/aom_highbd_8_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        my $avx2 = ($bd == 10) ? "avx2" : "";
+        specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
+        specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
+      }
+    }
+
+    specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance4x8          neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance128x64  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x128  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x8    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x16    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x8     sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x4     sse2      neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance4x8               neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance4x4   sse4_1      neon/;
+
+    specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance4x8          neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+      }
+    }
+
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+      }
+    }
+
+    foreach $bd (8, 10, 12) {
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8"   , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16"   , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4"    , qw/neon/;
+    }
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
+      }
+    }
   }
   #
   # Masked Variance / Masked Subpixel Variance
@@ -1541,7 +1726,7 @@
       foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
       }
     }
   }
@@ -1559,56 +1744,18 @@
     }
 
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      foreach $bd ("_", "_10_", "_12_") {
+      foreach $bd ("_8_", "_10_", "_12_") {
         foreach (@encoder_block_sizes) {
           ($w, $h) = @$_;
           add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
           add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+          specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/;
         }
       }
     }
   }
 
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x32 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x64 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x32 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x4 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x4 sse2 ssse3/;
-
   #
   # Comp Avg
   #
@@ -1616,469 +1763,25 @@
   specialize qw/aom_comp_avg_pred avx2 neon/;
 
   add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
-  specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
+  specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-
-    add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance4x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance4x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance4x4 neon/;
-
-    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-      foreach $bd (8, 10, 12) {
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance64x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance64x16" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance32x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance32x8" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x64", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance16x64" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance8x32", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance8x32" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
-      }
-    }
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse8x8 sse2 neon/;
-
     add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+    specialize qw/aom_highbd_comp_avg_pred neon/;
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
-    specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+    specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
 
     add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
-    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2/;
+    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2 neon/;
   }
-    #
-    # Subpixel Variance
-    #
-    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    }
-
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-    specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+    specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
   }
 
   # Flow estimation library
@@ -2087,7 +1790,7 @@
     specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
 
     add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
-    specialize qw/aom_compute_flow_at_point sse4_1/;
+    specialize qw/aom_compute_flow_at_point sse4_1 neon/;
   }
 
 }  # CONFIG_AV1_ENCODER
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h
index ab950ca..69da8f2 100644
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -24,12 +24,10 @@
 
 #define SIMD_CHECK 1  // Sanity checks in C equivalents
 
-#if HAVE_NEON
-#include "simd/v256_intrinsics_arm.h"
 // VS compiling for 32 bit targets does not support vector types in
 // structs as arguments, which makes the v256 type of the intrinsics
 // hard to support, so optimizations for this target are disabled.
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
 #include "simd/v256_intrinsics_x86.h"
 #else
 #include "simd/v256_intrinsics.h"
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 3d07a0f..c8ee780 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -24,826 +24,6 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 
-#if AOM_ARCH_AARCH64 && \
-    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
-  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
-  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
-  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
-  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
-  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
-  /* Shift left and insert new last column in transposed 4x4 block. */
-  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
-  /* Shift left and insert two new columns in transposed 4x4 block. */
-  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
-  /* Shift left and insert three new columns in transposed 4x4 block. */
-  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
-};
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
-                                          const int8x8_t filter,
-                                          const uint8x16x2_t permute_tbl) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
-                                          const int8x8_t filter,
-                                          const uint8x16x3_t permute_tbl) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
-  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
-  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
-  uint8x16_t s0, s1, s2, s3;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)x_step_q4;
-  (void)filter_y;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1);
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
-
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
-      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
-      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
-      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
-                                        uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b = vqtbl2q_u8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
-                                        uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b0, uint8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
-                                                  const uint8x16_t samples_hi,
-                                                  const int8x8_t filter) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum;
-
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
-  sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
-                                                  const uint8x16_t samples0_hi,
-                                                  const uint8x16_t samples1_lo,
-                                                  const uint8x16_t samples1_hi,
-                                                  const int8x8_t filter) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
-  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
-  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint8x16x2_t samples_LUT;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src += 7 * src_stride;
-
-    s7 = vdup_n_u8(0);
-    s8 = vdup_n_u8(0);
-    s9 = vdup_n_u8(0);
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
-    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
-    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
-    do {
-      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
-      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
-      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
-      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
-      s0123 = s4567;
-      s1234 = s5678;
-      s2345 = s6789;
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      s7 = vdup_n_u8(0);
-      s8 = vdup_n_u8(0);
-      s9 = vdup_n_u8(0);
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
-                           tran_concat_tbl);
-
-      do {
-        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                       filter);
-        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                       filter);
-        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                       filter);
-        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                       filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
-        s0123_lo = s4567_lo;
-        s0123_hi = s4567_hi;
-        s1234_lo = s5678_lo;
-        s1234_hi = s5678_hi;
-        s2345_lo = s6789_lo;
-        s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-#else  // !defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
-                                         const int8x8_t filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
-                                         const int8x8_t filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
-  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
-  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  uint8x16_t s0, s1, s2, s3;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)x_step_q4;
-  (void)filter_y;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1);
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
-
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
-                                        int8x8_t a3, int8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b = vqtbl2q_s8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
-                                        int8x8_t a3, int8x16_t *b0,
-                                        int8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
-                                                 const int8x16_t samples_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filter) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
-  sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
-                                                 const int8x16_t samples0_hi,
-                                                 const int8x16_t samples1_lo,
-                                                 const int8x16_t samples1_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filter) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
-  sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
-  sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x8_t range_limit = vdup_n_u8(128);
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-    src += 7 * src_stride;
-
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-    s7 = vdup_n_s8(0);
-    s8 = vdup_n_s8(0);
-    s9 = vdup_n_s8(0);
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
-    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
-    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
-    do {
-      uint8x8_t t7, t8, t9, t10;
-
-      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
-      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
-      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
-      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
-      s0123 = s4567;
-      s1234 = s5678;
-      s2345 = s6789;
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-      s += 7 * src_stride;
-
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-      s7 = vdup_n_s8(0);
-      s8 = vdup_n_s8(0);
-      s9 = vdup_n_s8(0);
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
-                           tran_concat_tbl);
-
-      do {
-        uint8x8_t t7, t8, t9, t10;
-
-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                      correction, filter);
-        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                      correction, filter);
-        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                      correction, filter);
-        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                      correction, filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
-        s0123_lo = s4567_lo;
-        s0123_hi = s4567_hi;
-        s1234_lo = s5678_lo;
-        s1234_hi = s5678_hi;
-        s2345_lo = s6789_lo;
-        s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-#endif  // defined(__ARM_FEATURE_MATMUL_INT8)
-
-#else  // !(AOM_ARCH_AARCH64 &&
-       //   (defined(__ARM_FEATURE_DOTPROD) ||
-       //    defined(__ARM_FEATURE_MATMUL_INT8)))
-
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
@@ -905,7 +85,7 @@
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -918,7 +98,7 @@
 
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -931,7 +111,7 @@
       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      transpose_u8_4x4(&d01, &d23);
+      transpose_elems_inplace_u8_4x4(&d01, &d23);
 
       store_u8_4x1(dst + 0 * dst_stride, d01, 0);
       store_u8_4x1(dst + 1 * dst_stride, d23, 0);
@@ -956,7 +136,7 @@
     if (w == 4) {
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -967,7 +147,8 @@
 
         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                     &t7);
-        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+        transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+                               &t3);
         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -978,7 +159,7 @@
         d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
         d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
 
-        transpose_u8_8x4(&d0, &d1, &d2, &d3);
+        transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
 
         store_u8_4x1(dst + 0 * dst_stride, d0, 0);
         store_u8_4x1(dst + 1 * dst_stride, d1, 0);
@@ -1002,7 +183,7 @@
 
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1017,7 +198,8 @@
 
         do {
           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                                         &t7);
           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1036,7 +218,8 @@
           d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
           d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
 
-          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+          transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                                         &d7);
 
           store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
@@ -1172,5 +355,3 @@
     } while (w != 0);
   }
 }
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
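
Note on the dot-product path being moved into the new aom_convolve8_neon_dotprod.c file below: the sdot variants shift each unsigned sample into signed range (vsubq_u8 with range_limit = 128) because SDOT only accepts signed 8-bit inputs, then start the accumulator at a "correction" of 128 * sum(filter taps) so the result equals the original unsigned convolution. The following scalar sketch is illustration only, not part of this patch; the helper names are hypothetical and it simply checks that identity in plain C.

/* Illustration only -- not part of this patch.
 * Shows why accumulating into correction = 128 * sum(filter) recovers the
 * unsigned convolution after samples are clamped to [-128, 127] for sdot. */
#include <assert.h>
#include <stdint.h>

static int32_t convolve_unsigned(const uint8_t *s, const int8_t *f) {
  int32_t sum = 0;
  for (int i = 0; i < 8; i++) sum += f[i] * s[i];
  return sum;
}

static int32_t convolve_signed_with_correction(const uint8_t *s,
                                               const int8_t *f) {
  /* Mirrors the correct_tmp / correction setup in the sdot functions:
   * correction = 128 * sum of the 8 filter taps. */
  int32_t correction = 0;
  for (int i = 0; i < 8; i++) correction += 128 * f[i];
  int32_t sum = correction;
  /* (int8_t)(s[i] - 128) mirrors vsubq_u8(samples, range_limit) followed by
   * reinterpretation as signed 8-bit values. */
  for (int i = 0; i < 8; i++) sum += f[i] * (int32_t)(int8_t)(s[i] - 128);
  return sum;
}

int main(void) {
  const uint8_t s[8] = { 0, 17, 64, 128, 200, 255, 90, 3 };
  const int8_t f[8] = { -1, 3, -10, 70, 70, -10, 3, -1 };
  assert(convolve_unsigned(s, f) == convolve_signed_with_correction(s, f));
  return 0;
}
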
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000..e565414
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b0,
+                                        int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+                                                 const int8x16_t samples0_hi,
+                                                 const int8x16_t samples1_lo,
+                                                 const int8x16_t samples1_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filter);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filter);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filter);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
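The vertical paths above keep each 4-row group in transposed form so that one
SDOT instruction consumes four consecutive rows per output column, and only
four new rows are loaded per iteration (the merge_block_tbl lookups rebuild
s4567/s5678/s6789 from the previous block instead of re-transposing seven
rows). Per output pixel the arithmetic is still a plain 8-tap column filter; a
scalar sketch for a single column, assuming FILTER_BITS == 7 as in
aom_filter.h (illustrative only, not part of the patch):

    /* One column of the vertical convolution: src points at the first of the
     * eight support rows, i.e. already offset back by (SUBPEL_TAPS / 2 - 1)
     * rows as in the NEON code. */
    static void convolve8_vert_column_scalar(const unsigned char *src,
                                             int src_stride, unsigned char *dst,
                                             int dst_stride, const short *f,
                                             int h) {
      for (int y = 0; y < h; ++y) {
        int sum = 0;
        for (int k = 0; k < 8; ++k) sum += f[k] * src[(y + k) * src_stride];
        int out = (sum + 64) >> 7; /* Round by FILTER_BITS, then saturate. */
        if (out < 0) out = 0;
        if (out > 255) out = 255;
        dst[y * dst_stride] = (unsigned char)out;
      }
    }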
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000..d778e8a
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
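A worked example of the first dot_prod_merge_block_tbl row: vqtbl2q indices
0-15 select bytes from the previous transposed block and 16-31 from the new
one. If the previous block holds rows {3, 4, 5, 6} laid out per output column
as { a3, a4, a5, a6, b3, b4, b5, b6, c3, ..., d3, ... } and the new block
holds rows {7, 8, 9, 10} in the same layout, then indices 1, 2, 3, 16 pick
a4, a5, a6, a7, indices 5, 6, 7, 20 pick b4, b5, b6, b7, and so on, yielding
the rows {4, 5, 6, 7} block without re-transposing. The second and third table
rows advance the window by two and three rows in the same way.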
+
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x2_t permute_tbl) {
+  uint8x16_t permuted_samples[2];
+  int32x4_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* Accumulate into a zero vector: USDOT takes the unsigned samples directly,
+   * so no range-clamp correction term is needed. */
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+  sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x3_t permute_tbl) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b0, uint8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+                                                  const uint8x16_t samples_hi,
+                                                  const int8x8_t filter) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum;
+
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+  sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+                                                  const uint8x16_t samples0_hi,
+                                                  const uint8x16_t samples1_lo,
+                                                  const uint8x16_t samples1_hi,
+                                                  const int8x8_t filter) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    s7 = vdup_n_u8(0);
+    s8 = vdup_n_u8(0);
+    s9 = vdup_n_u8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      s7 = vdup_n_u8(0);
+      s8 = vdup_n_u8(0);
+      s9 = vdup_n_u8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filter);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filter);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filter);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
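Unlike the SDOT kernels earlier in this change, the I8MM kernels use USDOT,
which multiplies unsigned 8-bit samples by signed 8-bit filter taps directly,
so the *_neon_i8mm paths need neither the 128 sample bias nor the precomputed
correction term and simply accumulate from zero. A scalar model of a single
USDOT lane (illustrative only, not part of the patch):

    /* Each 32-bit USDOT lane accumulates four u8 * s8 products. */
    static int usdot_lane_scalar(const unsigned char u[4],
                                 const signed char f[4], int acc) {
      for (int k = 0; k < 4; ++k) acc += (int)u[k] * (int)f[k];
      return acc;
    }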
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index 583d832..d746f9e 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -50,3 +50,104 @@
     }
   }
 }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  if (w < 8) {  // copy4
+    uint16x4_t s0, s1;
+    do {
+      s0 = vld1_u16(src);
+      src += src_stride;
+      s1 = vld1_u16(src);
+      src += src_stride;
+
+      vst1_u16(dst, s0);
+      dst += dst_stride;
+      vst1_u16(dst, s1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {  // copy8
+    uint16x8_t s0, s1;
+    do {
+      s0 = vld1q_u16(src);
+      src += src_stride;
+      s1 = vld1q_u16(src);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      dst += dst_stride;
+      vst1q_u16(dst, s1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w < 32) {  // copy16
+    uint16x8_t s0, s1, s2, s3;
+    do {
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      src += src_stride;
+      s2 = vld1q_u16(src);
+      s3 = vld1q_u16(src + 8);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      dst += dst_stride;
+      vst1q_u16(dst, s2);
+      vst1q_u16(dst + 8, s3);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 32) {  // copy32
+    uint16x8_t s0, s1, s2, s3;
+    do {
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {  // copy64
+    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    do {
+      const uint16_t *s = src;
+      uint16_t *d = dst;
+      int width = w;
+      do {
+        s0 = vld1q_u16(s);
+        s1 = vld1q_u16(s + 8);
+        s2 = vld1q_u16(s + 16);
+        s3 = vld1q_u16(s + 24);
+        s4 = vld1q_u16(s + 32);
+        s5 = vld1q_u16(s + 40);
+        s6 = vld1q_u16(s + 48);
+        s7 = vld1q_u16(s + 56);
+
+        vst1q_u16(d, s0);
+        vst1q_u16(d + 8, s1);
+        vst1q_u16(d + 16, s2);
+        vst1q_u16(d + 24, s3);
+        vst1q_u16(d + 32, s4);
+        vst1q_u16(d + 40, s5);
+        vst1q_u16(d + 48, s6);
+        vst1q_u16(d + 56, s7);
+        s += 64;
+        d += 64;
+        width -= 64;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
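The high-bit-depth copy above only specializes by width; the underlying
operation is a straight row-by-row copy of 16-bit pixels. A scalar reference
for comparison (illustrative only, not part of the patch):

    /* Copy a w x h block of 16-bit pixels; strides are in pixels. */
    static void highbd_copy_scalar(const unsigned short *src, int src_stride,
                                   unsigned short *dst, int dst_stride, int w,
                                   int h) {
      do {
        for (int x = 0; x < w; ++x) dst[x] = src[x];
        src += src_stride;
        dst += dst_stride;
      } while (--h != 0);
    }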
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index ef2f3af..2e79b2e 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <stdlib.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
@@ -19,75 +20,68 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 
-#if !AOM_ARCH_AARCH64
-static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
-  const uint32x4_t b = vpaddlq_u16(a);
-  const uint64x2_t c = vpaddlq_u32(b);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                  vreinterpret_u32_u64(vget_high_u64(c)));
-}
-#endif
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+  const uint8x8_t s0 = load_unaligned_u8(p, stride);
+  const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
 
-unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
-  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
-  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if AOM_ARCH_AARCH64
-  const uint32_t d = vaddlvq_u16(c);
-  return (d + 8) >> 4;
-#else
-  const uint32x2_t d = horizontal_add_u16x8_v(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
-#endif
+  const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
+  return (sum + (1 << 3)) >> 4;
 }
 
-unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
-  uint16x8_t sum;
-  uint8x8_t b = vld1_u8(a);
-  a += a_stride;
-  uint8x8_t c = vld1_u8(a);
-  a += a_stride;
-  sum = vaddl_u8(b, c);
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+  uint8x8_t s0 = vld1_u8(p);
+  p += stride;
+  uint8x8_t s1 = vld1_u8(p);
+  p += stride;
+  uint16x8_t acc = vaddl_u8(s0, s1);
 
-  for (int i = 0; i < 6; ++i) {
-    const uint8x8_t e = vld1_u8(a);
-    a += a_stride;
-    sum = vaddw_u8(sum, e);
-  }
+  int i = 0;
+  do {
+    const uint8x8_t si = vld1_u8(p);
+    p += stride;
+    acc = vaddw_u8(acc, si);
+  } while (++i < 6);
 
-#if AOM_ARCH_AARCH64
-  const uint32_t d = vaddlvq_u16(sum);
-  return (d + 32) >> 6;
-#else
-  const uint32x2_t d = horizontal_add_u16x8_v(sum);
-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
-#endif
+  const uint32_t sum = horizontal_add_u16x8(acc);
+  return (sum + (1 << 5)) >> 6;
 }
 
 void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
                            int *avg) {
-  for (int k = 0; k < 4; k++) {
-    const int x8_idx = x16_idx + ((k & 1) << 3);
-    const int y8_idx = y16_idx + ((k >> 1) << 3);
-    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
-    avg[k] = aom_avg_8x8_neon(s_tmp, p);
-  }
+  avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+  avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+  avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+  avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
 }
 
 int aom_satd_lp_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
+  int16x8_t s0 = vld1q_s16(coeff);
+  int16x8_t s1 = vld1q_s16(coeff + 8);
 
-  do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+  int16x8_t abs0 = vabsq_s16(s0);
+  int16x8_t abs1 = vabsq_s16(s1);
+
+  int32x4_t acc0 = vpaddlq_s16(abs0);
+  int32x4_t acc1 = vpaddlq_s16(abs1);
+
+  length -= 16;
+  coeff += 16;
+
+  while (length != 0) {
+    s0 = vld1q_s16(coeff);
+    s1 = vld1q_s16(coeff + 8);
+
+    abs0 = vabsq_s16(s0);
+    abs1 = vabsq_s16(s1);
+
+    acc0 = vpadalq_s16(acc0, abs0);
+    acc1 = vpadalq_s16(acc1, abs1);
+
     length -= 16;
     coeff += 16;
-  } while (length != 0);
+  }
 
+  int32x4_t accum = vaddq_s32(acc0, acc1);
   return horizontal_add_s32x4(accum);
 }
 
@@ -180,56 +174,84 @@
   } while (h < height);
 }
 
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
 int aom_satd_neon(const tran_low_t *coeff, int length) {
   const int32x4_t zero = vdupq_n_s32(0);
-  int32x4_t accum = zero;
-  do {
-    const int32x4_t src0 = vld1q_s32(&coeff[0]);
-    const int32x4_t src8 = vld1q_s32(&coeff[4]);
-    const int32x4_t src16 = vld1q_s32(&coeff[8]);
-    const int32x4_t src24 = vld1q_s32(&coeff[12]);
-    accum = vabaq_s32(accum, src0, zero);
-    accum = vabaq_s32(accum, src8, zero);
-    accum = vabaq_s32(accum, src16, zero);
-    accum = vabaq_s32(accum, src24, zero);
+
+  int32x4_t s0 = vld1q_s32(&coeff[0]);
+  int32x4_t s1 = vld1q_s32(&coeff[4]);
+  int32x4_t s2 = vld1q_s32(&coeff[8]);
+  int32x4_t s3 = vld1q_s32(&coeff[12]);
+
+  int32x4_t accum0 = vabsq_s32(s0);
+  int32x4_t accum1 = vabsq_s32(s2);
+  accum0 = vabaq_s32(accum0, s1, zero);
+  accum1 = vabaq_s32(accum1, s3, zero);
+
+  length -= 16;
+  coeff += 16;
+
+  while (length != 0) {
+    s0 = vld1q_s32(&coeff[0]);
+    s1 = vld1q_s32(&coeff[4]);
+    s2 = vld1q_s32(&coeff[8]);
+    s3 = vld1q_s32(&coeff[12]);
+
+    accum0 = vabaq_s32(accum0, s0, zero);
+    accum1 = vabaq_s32(accum1, s1, zero);
+    accum0 = vabaq_s32(accum0, s2, zero);
+    accum1 = vabaq_s32(accum1, s3, zero);
+
     length -= 16;
     coeff += 16;
-  } while (length != 0);
+  }
 
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return horizontal_add_s32x4(accum);
+  // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+  return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
 }
 
 int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
-  int32x4_t v_mean = vdupq_n_s32(0);
-  int32x4_t v_sse = v_mean;
-  int16x8_t v_ref, v_src;
-  int16x4_t v_low;
+  assert(bwl >= 2 && bwl <= 5);
+  int width = 4 << bwl;
 
-  int i, width = 4 << bwl;
-  for (i = 0; i < width; i += 8) {
-    v_ref = vld1q_s16(&ref[i]);
-    v_src = vld1q_s16(&src[i]);
-    const int16x8_t diff = vsubq_s16(v_ref, v_src);
-    // diff: dynamic range [-510, 510], 10 bits.
-    v_mean = vpadalq_s16(v_mean, diff);
-    v_low = vget_low_s16(diff);
-    v_sse = vmlal_s16(v_sse, v_low, v_low);
-#if AOM_ARCH_AARCH64
-    v_sse = vmlal_high_s16(v_sse, diff, diff);
-#else
-    const int16x4_t v_high = vget_high_s16(diff);
-    v_sse = vmlal_s16(v_sse, v_high, v_high);
-#endif
-  }
-  const int mean = horizontal_add_s32x4(v_mean);
-  const int sse = horizontal_add_s32x4(v_sse);
-  const unsigned int mean_abs = mean >= 0 ? mean : -mean;
-  // (mean * mean): dynamic range 31 bits.
-  const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
-  return var;
+  int16x8_t r = vld1q_s16(ref);
+  int16x8_t s = vld1q_s16(src);
+
+  // diff: dynamic range [-510, 510], 10 (signed) bits.
+  int16x8_t diff = vsubq_s16(r, s);
+  // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+  int16x8_t v_mean = diff;
+  // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+  int32x4_t v_sse[2];
+  v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+  v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+  ref += 8;
+  src += 8;
+  width -= 8;
+
+  do {
+    r = vld1q_s16(ref);
+    s = vld1q_s16(src);
+
+    diff = vsubq_s16(r, s);
+    v_mean = vaddq_s16(v_mean, diff);
+
+    v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+    v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  // Dynamic range [0, 65280], 16 (unsigned) bits.
+  const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+  const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+  // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+  return sse - ((mean_abs * mean_abs) >> (bwl + 2));
 }
 
 void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
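Two arithmetic details from the avg_neon.c changes, spelled out: the block
averages use round-to-nearest (half the pixel count is added before the
shift), and aom_vector_var_neon returns sse - sum^2 / N, i.e. N times the
variance of the ref/src difference, with N = 4 << bwl. A scalar sketch of both
(illustrative only, not part of the patch):

    /* 8x8 average: 64 pixels, so add 32 before shifting by 6, matching
     * (sum + (1 << 5)) >> 6 above. */
    static unsigned avg_8x8_scalar(const unsigned char *p, int stride) {
      unsigned sum = 0;
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c) sum += p[r * stride + c];
      return (sum + 32) >> 6;
    }

    /* Variance numerator: sse - sum^2 / N with N = 4 << bwl, so the shift by
     * (bwl + 2) is an exact division. abs() keeps the square within unsigned
     * 32-bit range, mirroring the NEON code. */
    static int vector_var_scalar(const short *ref, const short *src, int bwl) {
      const int n = 4 << bwl;
      int sum = 0, sse = 0;
      for (int i = 0; i < n; ++i) {
        const int d = ref[i] - src[i];
        sum += d;
        sse += d * d;
      }
      const unsigned mean_abs = (unsigned)(sum >= 0 ? sum : -sum);
      return sse - (int)((mean_abs * mean_abs) >> (bwl + 2));
    }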
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index 04e0904..b17f7fc 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -13,6 +13,9 @@
 #include <assert.h>
 
 #include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/blend.h"
 
@@ -74,6 +77,75 @@
   }
 }
 
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+                                     int width, int height, const uint8_t *ref,
+                                     int ref_stride,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+  if (width > 8) {
+    do {
+      const uint8_t *pred_ptr = pred;
+      const uint8_t *ref_ptr = ref;
+      uint8_t *comp_pred_ptr = comp_pred;
+      int w = width;
+
+      do {
+        const uint8x16_t p = vld1q_u8(pred_ptr);
+        const uint8x16_t r = vld1q_u8(ref_ptr);
+
+        const uint8x16_t wtd_avg =
+            dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+        vst1q_u8(comp_pred_ptr, wtd_avg);
+
+        ref_ptr += 16;
+        pred_ptr += 16;
+        comp_pred_ptr += 16;
+        w -= 16;
+      } while (w != 0);
+
+      ref += ref_stride;
+      pred += width;
+      comp_pred += width;
+    } while (--height != 0);
+  } else if (width == 8) {
+    int h = height / 2;
+
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+      const uint8x16_t wtd_avg =
+          dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+      vst1q_u8(comp_pred, wtd_avg);
+
+      ref += 2 * ref_stride;
+      pred += 16;
+      comp_pred += 16;
+    } while (--h != 0);
+  } else {
+    int h = height / 2;
+    assert(width == 4);
+
+    do {
+      const uint8x8_t p = vld1_u8(pred);
+      const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+      const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+                                                  vget_low_u8(bck_offset));
+
+      vst1_u8(comp_pred, wtd_avg);
+
+      ref += 2 * ref_stride;
+      pred += 8;
+      comp_pred += 8;
+    } while (--h != 0);
+  }
+}
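The helpers pulled in from blend_neon.h and dist_wtd_avg_neon.h implement the
standard 8-bit AV1 blends. As a rough scalar model: AOM_BLEND_A64_MAX_ALPHA is
64 and AOM_BLEND_A64_ROUND_BITS is 6 (aom_dsp/blend.h), and the
distance-weighted offsets are assumed to sum to 16, i.e. a 4-bit weight
precision; the sketch below is illustrative only, not part of the patch:

    /* Scalar model of alpha_blend_a64_u8x8/x16: per-pixel 6-bit alpha blend,
     * round((m * a + (64 - m) * b) / 64) with m in [0, 64]. */
    static unsigned char alpha_blend_a64_scalar(int m, int a, int b) {
      return (unsigned char)((m * a + (64 - m) * b + 32) >> 6);
    }

    /* Scalar model of dist_wtd_avg_u8x8/x16, assuming
     * fwd_offset + bck_offset == 16. */
    static unsigned char dist_wtd_avg_scalar(int r, int p, int fwd, int bck) {
      return (unsigned char)((fwd * r + bck * p + 8) >> 4);
    }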
+
 void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
                              int height, const uint8_t *ref, int ref_stride,
                              const uint8_t *mask, int mask_stride,
@@ -84,7 +156,6 @@
   const int src_stride1 = invert_mask ? ref_stride : width;
 
   if (width > 8) {
-    const uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
     do {
       const uint8_t *src0_ptr = src0;
       const uint8_t *src1_ptr = src1;
@@ -97,19 +168,7 @@
         const uint8x16_t s1 = vld1q_u8(src1_ptr);
         const uint8x16_t m0 = vld1q_u8(mask_ptr);
 
-        uint8x16_t m0_inv = vsubq_u8(max_alpha, m0);
-        uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(m0));
-        uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(m0));
-        blend_u16_lo =
-            vmlal_u8(blend_u16_lo, vget_low_u8(s1), vget_low_u8(m0_inv));
-        blend_u16_hi =
-            vmlal_u8(blend_u16_hi, vget_high_u8(s1), vget_high_u8(m0_inv));
-
-        uint8x8_t blend_u8_lo =
-            vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
-        uint8x8_t blend_u8_hi =
-            vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
-        uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+        uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
 
         vst1q_u8(comp_pred_ptr, blend_u8);
 
@@ -126,17 +185,12 @@
       comp_pred += width;
     } while (--height != 0);
   } else if (width == 8) {
-    const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
-
     do {
       const uint8x8_t s0 = vld1_u8(src0);
       const uint8x8_t s1 = vld1_u8(src1);
       const uint8x8_t m0 = vld1_u8(mask);
 
-      uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
-      uint16x8_t blend_u16 = vmull_u8(s0, m0);
-      blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
-      uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+      uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
 
       vst1_u8(comp_pred, blend_u8);
 
@@ -146,7 +200,6 @@
       comp_pred += 8;
     } while (--height != 0);
   } else {
-    const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
     int h = height / 2;
     assert(width == 4);
 
@@ -155,10 +208,7 @@
       const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
       const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
 
-      uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
-      uint16x8_t blend_u16 = vmull_u8(s0, m0);
-      blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
-      uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+      uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
 
       vst1_u8(comp_pred, blend_u8);
 
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index c3ee0b7..7b1b66a 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -12,117 +12,34 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
-                            const int16x8_t v_maxval, int16x8_t *res) {
-  int32x4_t im_res_low, im_res_high;
-  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
 
-  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
-  im_res_low =
-      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+                                    uint16x8_t round_offset) {
+  const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
 
-  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
-  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
-                          vget_high_s16(src_1));
+  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
 
-  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
-                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
+  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+  blend_u32_hi =
+      vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
 
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
-                             int16x8_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1, src0_2, src0_3;
-  int16x8_t src1_0, src1_1, src1_2, src1_3;
-  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+  uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
 
-  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
-               &src0_3);
-  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
-               &src1_3);
+  uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
 
-  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
-  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
-  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
-  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+  res = vqsubq_u16(res, round_offset);
 
-  uint16x8_t im_res1_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
-  uint16x8_t im_res1_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
-  uint16x8_t im_res1_2 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
-  uint16x8_t im_res1_3 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
-  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
-  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
-  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
-  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
-  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
-  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
-  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
-  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
-
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
-                             int16x4_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1;
-  int16x8_t src1_0, src1_1;
-  uint16x8_t tu0 = vdupq_n_u16(0);
-  uint16x8_t tu1 = vdupq_n_u16(0);
-  uint16x8_t tu2 = vdupq_n_u16(0);
-  uint16x8_t tu3 = vdupq_n_u16(0);
-  int16x8_t mask0_1, mask2_3;
-  int16x8_t res0, res1;
-
-  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
-  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
-  src0_0 = vreinterpretq_s16_u16(tu0);
-  src0_1 = vreinterpretq_s16_u16(tu1);
-
-  src1_0 = vreinterpretq_s16_u16(tu2);
-  src1_1 = vreinterpretq_s16_u16(tu3);
-
-  mask0_1 = vcombine_s16(mask0, mask1);
-  mask2_3 = vcombine_s16(mask2, mask3);
-
-  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
-  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
-  uint16x8_t im_res_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
-  uint16x8_t im_res_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
-  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
-  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
-  uint8x8_t res_0 = vqmovun_s16(src0_0);
-  uint8x8_t res_1 = vqmovun_s16(src0_1);
-
-  store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
-  store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
-  store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
-  store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
+  return vqrshrn_n_u16(res,
+                       2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
 }
 
 void aom_lowbd_blend_a64_d16_mask_neon(
@@ -130,19 +47,13 @@
     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
     ConvolveParams *conv_params) {
-  int i = 0;
-  const int bd = 8;
-  int w_tmp = w;
-  const uint8_t *mask_tmp = mask;
-  const CONV_BUF_TYPE *src0_tmp = src0;
-  const CONV_BUF_TYPE *src1_tmp = src1;
-  uint8_t *dst_tmp = dst;
+  (void)conv_params;
 
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
 
   assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
@@ -152,294 +63,430 @@
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  uint8x8_t s0 = vdup_n_u8(0);
-  uint8x8_t s1 = vdup_n_u8(0);
-  uint8x8_t s2 = vdup_n_u8(0);
-  uint8x8_t s3 = vdup_n_u8(0);
-  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
-  int16x8_t mask0, mask1, mask2, mask3;
-  int16x8_t mask4, mask5, mask6, mask7;
-  int32x4_t m0_32, m1_32, m2_32, m3_32;
-  int32x4_t m4_32, m5_32, m6_32, m7_32;
-  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
-  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
-  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
-  const uint16x4_t vec_zero = vdup_n_u16(0);
-  const uint16_t offset = round_offset - (1 << (round_bits - 1));
-  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
-  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
-  const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
   if (subw == 0 && subh == 0) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
-          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
-          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
-          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+          uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
+        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
-        mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
 
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else if (subw == 1 && subh == 1) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
-                       &t7);
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
-          mask1 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
-          mask2 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
-          mask3 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+          uint16x8_t m_avg =
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
 
-          mask4 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
-          mask5 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
-          mask6 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
-          mask7 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          m0_32 = vpaddlq_s16(mask0);
-          m1_32 = vpaddlq_s16(mask1);
-          m2_32 = vpaddlq_s16(mask2);
-          m3_32 = vpaddlq_s16(mask3);
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          m4_32 = vpaddlq_s16(mask4);
-          m5_32 = vpaddlq_s16(mask5);
-          m6_32 = vpaddlq_s16(mask6);
-          m7_32 = vpaddlq_s16(mask7);
-
-          mask0 =
-              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
-          mask1 =
-              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
-          mask2 =
-              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
-          mask3 =
-              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        m0_32 = vpaddlq_s16(mask0);
-        m1_32 = vpaddlq_s16(mask1);
-        m2_32 = vpaddlq_s16(mask2);
-        m3_32 = vpaddlq_s16(mask3);
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        mask0_low = vqrshrn_n_s32(m0_32, 2);
-        mask1_low = vqrshrn_n_s32(m1_32, 2);
-        mask2_low = vqrshrn_n_s32(m2_32, 2);
-        mask3_low = vqrshrn_n_s32(m3_32, 2);
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else if (subw == 1 && subh == 0) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+          uint8x8_t m0 = vld1_u8(mask + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
-          mask1 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
-          mask2 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
-          mask3 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l);
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
-        mask1 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
-        mask2 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
-        mask3 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
-        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
-        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
-        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
-        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
-                              &s3);
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  }
+}
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
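+// Blend two 8-bit predictors with a 0..64 alpha mask (AOM_BLEND_A64). subw
+// and subh indicate that the mask is supplied at 2x resolution in that
+// direction and must first be averaged down to the block size.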
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+                             const uint8_t *src0, uint32_t src0_stride,
+                             const uint8_t *src1, uint32_t src1_stride,
+                             const uint8_t *mask, uint32_t mask_stride, int w,
+                             int h, int subw, int subh) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if ((subw | subh) == 0) {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + i);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+          i += 16;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if ((subw & subh) == 1) {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+          uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+        uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if (subw == 1 && subh == 0) {
+    if (w > 8) {
+      do {
+        int i = 0;
+
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+          uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask);
+        uint8x8_t m1 = vld1_u8(mask + 8);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   }
 }
diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000..c8a03224
--- /dev/null
+++ b/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
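+// Helpers implementing AOM_BLEND_A64 from aom_dsp/blend.h:
+//   blend = (m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b), rounded and shifted
+//   right by AOM_BLEND_A64_ROUND_BITS, with mask values m in [0, 64].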
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+                                               uint8x16_t b) {
+  const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+  uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+  blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+  blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+  uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+                                             uint8x8_t b) {
+  const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint16x8_t blend_u16 = vmull_u8(m, a);
+
+  blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+  return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+                                               uint16x8_t b) {
+  uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+  blend_u32_hi =
+      vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+  uint16x4_t blend_u16_lo =
+      vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint16x4_t blend_u16_hi =
+      vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+                                               uint16x4_t b) {
+  const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32 = vmull_u16(m, a);
+
+  blend_u32 = vmlal_u16(blend_u32, m_inv, b);
+
+  return vrshrn_n_u32(blend_u32, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
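+// Rounded average of two vectors; used to average adjacent mask rows when the
+// mask has 2x vertical resolution only (subw == 0, subh == 1).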
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+  return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+  return vrhaddq_u8(a, b);
+}
+
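+// Rounded average of horizontally adjacent pairs; used to average adjacent
+// mask columns when the mask has 2x horizontal resolution only (subw == 1,
+// subh == 0).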
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+  return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+  return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+  uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+  uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+  return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif  // AOM_ARCH_AARCH64
+}
+
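+// Rounded average of four mask values per output lane (one 2x2 mask block
+// each). Four 0..64 mask values can sum to 256, hence the saturating add;
+// saturating at 255 still rounds to 64 after the shift by 2.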
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+                                                  uint8x8_t c, uint8x8_t d) {
+  uint8x8_t a_c = vpadd_u8(a, c);
+  uint8x8_t b_d = vpadd_u8(b, d);
+  return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+                                                    uint8x16_t c,
+                                                    uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+  uint8x16_t a_c = vpaddq_u8(a, c);
+  uint8x16_t b_d = vpaddq_u8(b, d);
+  return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+  uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+  uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+  uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+  uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+  uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+  uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+  return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif  // AOM_ARCH_AARCH64
+}
+
+#endif  // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000..f2ada93
--- /dev/null
+++ b/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+  int i = bh;
+  int32x4_t sum = vdupq_n_s32(0);
+  int32x4_t sse = vdupq_n_s32(0);
+
+  do {
+    int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+    sum = vpadalq_s16(sum, d);
+
+    sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+    sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+    data += 2 * stride;
+    i -= 2;
+  } while (i != 0);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+  int i = bh;
+  int32x4_t sum = vdupq_n_s32(0);
+  int32x4_t sse = vdupq_n_s32(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t and the
+  // maximum value for bh is 32, we don't have to worry about sse overflowing.
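+  // (Worst case: each |element| < 2^12, so each square is below 2^24 and
+  // 127 * 2^24 < 2^31.)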
+
+  do {
+    int16x8_t d = vld1q_s16(data);
+
+    sum = vpadalq_s16(sum, d);
+
+    sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+    sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+    data += stride;
+  } while (--i != 0);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+                                              int bw, int bh, int *x_sum,
+                                              int64_t *x2_sum) {
+  int32x4_t sum = vdupq_n_s32(0);
+  int64x2_t sse = vdupq_n_s64(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t vector,
+  // we can process up to (127 * 4) / bw rows before we need to widen to
+  // 64 bits.
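+  // For example, bw == 16 allows 31 rows and bw == 32 allows 15 rows per
+  // partial accumulation.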
+
+  int i_limit = (127 * 4) / bw;
+  int i_tmp = bh > i_limit ? i_limit : bh;
+
+  int i = 0;
+  do {
+    int32x4_t sse_s32 = vdupq_n_s32(0);
+    do {
+      int j = bw;
+      const int16_t *data_ptr = data;
+      do {
+        int16x8_t d = vld1q_s16(data_ptr);
+
+        sum = vpadalq_s16(sum, d);
+
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+        data_ptr += 8;
+        j -= 8;
+      } while (j != 0);
+
+      data += stride;
+      i++;
+    } while (i < i_tmp && i < bh);
+
+    sse = vpadalq_s32(sse, sse_s32);
+    i_tmp += i_limit;
+  } while (i < bh);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  if (bw == 4) {
+    get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+  } else if (bw == 8) {
+    get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+  } else {
+    assert(bw % 8 == 0);
+    get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000..19c9b04
--- /dev/null
+++ b/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
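+// Distance-weighted averages used for compound prediction:
+//   avg = (a * wta + b * wtb), rounded and shifted right by
+//   DIST_PRECISION_BITS.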
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+                                          uint8x8_t wta, uint8x8_t wtb) {
+  uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+  wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+  return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+                                            uint16x4_t wta, uint16x4_t wtb) {
+  uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+  wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+  return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+                                            uint8x16_t wta, uint8x16_t wtb) {
+  uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+  uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+  wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+  wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+  uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+  uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+  return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+                                            uint16x8_t wta, uint16x8_t wtb) {
+  uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+  uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+  wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+  wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+  uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+  uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+  return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif  // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index a7d66b3..fb4cda7 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -48,8 +48,8 @@
     // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
     const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
     const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
 
     // fdct_round_shift
     int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -57,11 +57,13 @@
 
     // s_3 * cospi_8_64 + s_2 * cospi_24_64
     // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
 
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+    const int32x4_t temp3 =
+        vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+    const int32x4_t temp4 =
+        vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
 
     // fdct_round_shift
     int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
@@ -69,7 +71,7 @@
 
     // Only transpose the first pass
     if (i == 0) {
-      transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+      transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
     }
 
     *input_0 = out_0;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 82ce0cd..d0f5922 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -37,7 +37,7 @@
 
   hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
 
-  transpose_s16_4x4d(&a0, &a1, &a2, &a3);
+  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
 
   hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
 
@@ -91,7 +91,7 @@
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
@@ -120,7 +120,7 @@
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
@@ -196,56 +196,90 @@
   /* Bottom right. */
   aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
 
+  // Each iteration of the loop operates on entire rows (16 samples each)
+  // because we need to swap the second and third quarters of every row in the
+  // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+  // loop at the end of aom_hadamard_16x16_c.
   for (int i = 0; i < 64; i += 16) {
-    const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
-    const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
-    const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
-    const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
+    const int32x4_t a00 = vld1q_s32(coeff + 0);
+    const int32x4_t a01 = vld1q_s32(coeff + 64);
+    const int32x4_t a02 = vld1q_s32(coeff + 128);
+    const int32x4_t a03 = vld1q_s32(coeff + 192);
 
-    const int16x8_t b00 = vhaddq_s16(a00, a01);
-    const int16x8_t b01 = vhsubq_s16(a00, a01);
-    const int16x8_t b02 = vhaddq_s16(a02, a03);
-    const int16x8_t b03 = vhsubq_s16(a02, a03);
+    const int32x4_t b00 = vhaddq_s32(a00, a01);
+    const int32x4_t b01 = vhsubq_s32(a00, a01);
+    const int32x4_t b02 = vhaddq_s32(a02, a03);
+    const int32x4_t b03 = vhsubq_s32(a02, a03);
 
-    const int16x8_t c00 = vaddq_s16(b00, b02);
-    const int16x8_t c01 = vaddq_s16(b01, b03);
-    const int16x8_t c02 = vsubq_s16(b00, b02);
-    const int16x8_t c03 = vsubq_s16(b01, b03);
+    const int32x4_t c00 = vaddq_s32(b00, b02);
+    const int32x4_t c01 = vaddq_s32(b01, b03);
+    const int32x4_t c02 = vsubq_s32(b00, b02);
+    const int32x4_t c03 = vsubq_s32(b01, b03);
 
-    const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
-    const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
-    const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
-    const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
+    const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+    const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+    const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+    const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
 
-    const int16x8_t b10 = vhaddq_s16(a10, a11);
-    const int16x8_t b11 = vhsubq_s16(a10, a11);
-    const int16x8_t b12 = vhaddq_s16(a12, a13);
-    const int16x8_t b13 = vhsubq_s16(a12, a13);
+    const int32x4_t b10 = vhaddq_s32(a10, a11);
+    const int32x4_t b11 = vhsubq_s32(a10, a11);
+    const int32x4_t b12 = vhaddq_s32(a12, a13);
+    const int32x4_t b13 = vhsubq_s32(a12, a13);
 
-    const int16x8_t c10 = vaddq_s16(b10, b12);
-    const int16x8_t c11 = vaddq_s16(b11, b13);
-    const int16x8_t c12 = vsubq_s16(b10, b12);
-    const int16x8_t c13 = vsubq_s16(b11, b13);
+    const int32x4_t c10 = vaddq_s32(b10, b12);
+    const int32x4_t c11 = vaddq_s32(b11, b13);
+    const int32x4_t c12 = vsubq_s32(b10, b12);
+    const int32x4_t c13 = vsubq_s32(b11, b13);
 
-    store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
-    store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
-    store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
-    store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
+    const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+    const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+    const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+    const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
 
-    store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
-    store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
-    store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
-    store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
+    const int32x4_t b20 = vhaddq_s32(a20, a21);
+    const int32x4_t b21 = vhsubq_s32(a20, a21);
+    const int32x4_t b22 = vhaddq_s32(a22, a23);
+    const int32x4_t b23 = vhsubq_s32(a22, a23);
 
-    store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
-    store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
-    store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
-    store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
+    const int32x4_t c20 = vaddq_s32(b20, b22);
+    const int32x4_t c21 = vaddq_s32(b21, b23);
+    const int32x4_t c22 = vsubq_s32(b20, b22);
+    const int32x4_t c23 = vsubq_s32(b21, b23);
 
-    store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
-    store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
-    store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
-    store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+    const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+    const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+    const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+    const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+    const int32x4_t b30 = vhaddq_s32(a30, a31);
+    const int32x4_t b31 = vhsubq_s32(a30, a31);
+    const int32x4_t b32 = vhaddq_s32(a32, a33);
+    const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+    const int32x4_t c30 = vaddq_s32(b30, b32);
+    const int32x4_t c31 = vaddq_s32(b31, b33);
+    const int32x4_t c32 = vsubq_s32(b30, b32);
+    const int32x4_t c33 = vsubq_s32(b31, b33);
+
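+    // Store with the second and third four-sample quarters of each row
+    // swapped (c00, c20, c10, c30, ...) to produce the ordering described
+    // above.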
+    vst1q_s32(coeff + 0 + 0, c00);
+    vst1q_s32(coeff + 0 + 4, c20);
+    vst1q_s32(coeff + 0 + 8, c10);
+    vst1q_s32(coeff + 0 + 12, c30);
+
+    vst1q_s32(coeff + 64 + 0, c01);
+    vst1q_s32(coeff + 64 + 4, c21);
+    vst1q_s32(coeff + 64 + 8, c11);
+    vst1q_s32(coeff + 64 + 12, c31);
+
+    vst1q_s32(coeff + 128 + 0, c02);
+    vst1q_s32(coeff + 128 + 4, c22);
+    vst1q_s32(coeff + 128 + 8, c12);
+    vst1q_s32(coeff + 128 + 12, c32);
+
+    vst1q_s32(coeff + 192 + 0, c03);
+    vst1q_s32(coeff + 192 + 4, c23);
+    vst1q_s32(coeff + 192 + 8, c13);
+    vst1q_s32(coeff + 192 + 12, c33);
 
     coeff += 16;
   }
diff --git a/aom_dsp/arm/highbd_avg_pred_neon.c b/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000..531309b
--- /dev/null
+++ b/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  int i = height;
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        uint16x8_t avg = vrhaddq_u16(p, r);
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      uint16x8_t avg = vrhaddq_u16(p, r);
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      uint16x4_t avg = vrhadd_u16(p, r);
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+}
+
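+// comp_pred = AOM_BLEND_A64(mask, ref, pred); when invert_mask is set the
+// roles of ref and pred are swapped so the mask weights the prediction
+// instead.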
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int src_stride0 = invert_mask ? width : ref_stride;
+  const int src_stride1 = invert_mask ? ref_stride : width;
+
+  if (width >= 8) {
+    do {
+      int j = 0;
+
+      do {
+        const uint16x8_t s0 = vld1q_u16(src0 + j);
+        const uint16x8_t s1 = vld1q_u16(src1 + j);
+        const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+        uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        vst1q_u16(comp_pred + j, blend_u16);
+
+        j += 8;
+      } while (j < width);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += width;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+
+    do {
+      const uint16x4_t s0 = vld1_u16(src0);
+      const uint16x4_t s1 = vld1_u16(src1);
+      const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+      uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      vst1_u16(comp_pred, blend_u16);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += 4;
+    } while (--height != 0);
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        const uint16x8_t avg =
+            dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      const uint16x8_t avg =
+          dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      const uint16x4_t avg = dist_wtd_avg_u16x4(
+          r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000..bdd2177
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
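+// Horizontal-mask blend: a single row of 0..64 mask values is applied to
+// every row of the block.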
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (w >= 8) {
+    do {
+      int i = 0;
+      do {
+        uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+        uint16x8_t s0 = vld1q_u16(src0 + i);
+        uint16x8_t s1 = vld1q_u16(src1 + i);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        vst1q_u16(dst + i, blend);
+        i += 8;
+      } while (i < w);
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+    do {
+      uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+      uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+      store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 8) {
+    const uint16x4_t m0 =
+        vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+    do {
+      uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+      uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+      uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                                 src1_stride, mask, w, h, bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000..36d763a
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
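+// Expands to per-bitdepth helpers that blend two CONV_BUF_TYPE (pre-rounding
+// compound) buffers. The negated, pre-shifted round_offset is added once so
+// that the compound offset present in both inputs cancels out
+// (m + (64 - m) == 64 == 1 << AOM_BLEND_A64_ROUND_BITS); the final rounding
+// shift and clamp to (1 << bd) - 1 then convert back to pixel values.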
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits)                               \
+  static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8(                  \
+      uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) {     \
+    const uint16x8_t m_inv =                                                  \
+        vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);                   \
+                                                                              \
+    uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset),  \
+                                        vget_low_u16(m), vget_low_u16(a));    \
+    uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset),  \
+                                        vget_high_u16(m), vget_high_u16(a));  \
+                                                                              \
+    blend_u32_lo =                                                            \
+        vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));        \
+    blend_u32_hi =                                                            \
+        vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));      \
+                                                                              \
+    uint16x4_t blend_u16_lo =                                                 \
+        vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo),                   \
+                       AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS -           \
+                           round0_bits - COMPOUND_ROUND1_BITS);               \
+    uint16x4_t blend_u16_hi =                                                 \
+        vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi),                   \
+                       AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS -           \
+                           round0_bits - COMPOUND_ROUND1_BITS);               \
+                                                                              \
+    uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi);          \
+    blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1));             \
+                                                                              \
+    return blend_u16;                                                         \
+  }                                                                           \
+                                                                              \
+  static INLINE void highbd_##bd##_blend_a64_d16_mask_neon(                   \
+      uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,          \
+      uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  \
+      const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw,      \
+      int subh) {                                                             \
+    const int offset_bits = bd + 2 * FILTER_BITS - round0_bits;               \
+    int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +      \
+                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));   \
+    int32x4_t offset =                                                        \
+        vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS));             \
+                                                                              \
+    if ((subw | subh) == 0) {                                                 \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));                      \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset);         \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += mask_stride;                                                \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset);           \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else if ((subw & subh) == 1) {                                          \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);         \
+            uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);         \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(            \
+                vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0),           \
+                vget_high_u8(m1)));                                           \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);                     \
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);                     \
+          uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);                     \
+          uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);                     \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg =                                                  \
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));            \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 4 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else if (subw == 1 && subh == 0) {                                      \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x8_t m0 = vld1_u8(mask + 2 * i);                             \
+            uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);                         \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));     \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += mask_stride;                                                \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);                     \
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);                     \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));       \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else {                                                                  \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);               \
+            uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);               \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));              \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0_2 =                                                    \
+              load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+          uint8x8_t m1_3 =                                                    \
+              load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));            \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 4 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    }                                                                         \
+  }
+
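+// The second macro argument is the convolution round-0 shift assumed for the
+// incoming CONV_BUF_TYPE data: the AV1 compound convolve path raises it by 2
+// for 12-bit input, so the 12-bit specialization derives a different offset.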
+// 12-bit
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10-bit
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8-bit
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
+
+void aom_highbd_blend_a64_d16_mask_neon(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  (void)conv_params;
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  if (bd == 12) {
+    highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, mask, mask_stride, w, h,
+                                      subw, subh);
+  } else if (bd == 10) {
+    highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, mask, mask_stride, w, h,
+                                      subw, subh);
+  } else {
+    highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, w, h, subw,
+                                     subh);
+  }
+}
+
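+// Pixel-domain variant: unlike the d16 functions above, src0/src1 here are
+// plain 16-bit pixels rather than compound convolve intermediates, so no
+// rounding offset is required.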
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                    const uint8_t *src0_8, uint32_t src0_stride,
+                                    const uint8_t *src1_8, uint32_t src1_stride,
+                                    const uint8_t *mask, uint32_t mask_stride,
+                                    int w, int h, int subw, int subh, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if ((subw | subh) == 0) {
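+    // No mask subsampling: one mask value per output pixel.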
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+          i += 8;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if ((subw & subh) == 1) {
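+    // The mask is at twice the block resolution in both directions: each
+    // output pixel combines a 2x2 group of mask values
+    // (avg_blend_pairwise_u8x8_4).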
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg =
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if (subw == 1 && subh == 0) {
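+    // The mask is at twice the horizontal resolution only: combine
+    // horizontally adjacent mask pairs (avg_blend_pairwise_u8x8).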
+    if (w >= 8) {
+      do {
+        int i = 0;
+
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else {
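+    // Remaining case (subw == 0, subh == 1): the mask is at twice the
+    // vertical resolution only, so average vertically adjacent mask rows
+    // (avg_blend_u8x8).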
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000..ea3d655
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (w >= 8) {
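+    // A single mask value applies to each row, so broadcast it across the
+    // vector.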
+    do {
+      uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+      int i = 0;
+      do {
+        uint16x8_t s0 = vld1q_u16(src0 + i);
+        uint16x8_t s1 = vld1q_u16(src1 + i);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+        vst1q_u16(dst + i, blend);
+        i += 8;
+      } while (i < w);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
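+    // Process two rows per iteration, each with its own broadcast mask value.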
+    do {
+      uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+      uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+      uint16x8_t m = vcombine_u16(m1, m2);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+      uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+      store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 8) {
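+    // Two rows of two pixels per iteration: duplicate each of the two mask
+    // values across its row before widening to 16 bits.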
+    do {
+      uint16x4_t m0 = vdup_n_u16(0);
+      m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+      uint8x8_t m0_zip =
+          vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+      m0 = vget_low_u16(vmovl_u8(m0_zip));
+      uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+      uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+      uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
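+    // Remaining small sizes fall back to the C implementation.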
+    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                                 src1_stride, mask, w, h, bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000..e25438c
--- /dev/null
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t highbd_convolve8_4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+  return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  int32x4_t sum =
+      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
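+// Horizontal 8-tap filter of 4 output pixels: build the shifted input windows
+// with vext and reuse the 8-tap dot product above.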
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  const int16x8_t s2 = vextq_s16(s0, s1, 1);
+  const int16x8_t s3 = vextq_s16(s0, s1, 2);
+  const int16x8_t s4 = vextq_s16(s0, s1, 3);
+  const int16x4_t s0_lo = vget_low_s16(s0);
+  const int16x4_t s1_lo = vget_low_s16(s2);
+  const int16x4_t s2_lo = vget_low_s16(s3);
+  const int16x4_t s3_lo = vget_low_s16(s4);
+  const int16x4_t s4_lo = vget_high_s16(s0);
+  const int16x4_t s5_lo = vget_high_s16(s2);
+  const int16x4_t s6_lo = vget_high_s16(s3);
+  const int16x4_t s7_lo = vget_high_s16(s4);
+
+  return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+                                s7_lo, x_filter_0_7);
+}
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
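+// 8-tap filter of 8 pixels, accumulating the low and high halves in two
+// separate 32-bit sums.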
+static INLINE void highbd_convolve8_8_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    int32x4_t *sum0, int32x4_t *sum1) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+  *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+                                               const int16x8_t s0_hi,
+                                               const int16x8_t x_filter_0_7,
+                                               int32x4_t *sum0,
+                                               int32x4_t *sum1) {
+  const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+  const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+  const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+  const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+  const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+  const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+  const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+                         sum1);
+}
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  int32x4_t sum0, sum1;
+  highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+  int32x4_t sum0;
+  int32x4_t sum1;
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+                         &sum1);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
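+// Horizontal 8-tap convolution for the non-scaled case (x_step_q4 == 16);
+// results are clamped to the bit-depth maximum.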
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+                                       ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                       ptrdiff_t dst_stride,
+                                       const int16_t *x_filter_ptr,
+                                       int x_step_q4, int w, int h, int bd) {
+  assert(w >= 4 && h >= 4);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0, s1, s2, s3;
+      load_s16_8x2(s, src_stride, &s0, &s2);
+      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+      uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+      uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+      uint16x8_t d01 = vcombine_u16(d0, d1);
+      d01 = vminq_u16(d01, max);
+
+      vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+      vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+      s += 2 * src_stride;
+      d += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+      int x_q4 = 0;
+
+      const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+      int16x8_t s0, s2, s4, s6;
+      load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+      src_x += 8;
+
+      do {
+        int16x8_t s1, s3, s5, s7;
+        load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+        uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+        uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+        uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+        uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s1;
+        s2 = s3;
+        s4 = s5;
+        s6 = s7;
+        src_x += 8;
+        d += 8;
+        width -= 8;
+        x_q4 += 8 * x_step_q4;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+  } else {
+    (void)filter_y;
+    (void)y_step_q4;
+
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+    src -= SUBPEL_TAPS / 2 - 1;
+    highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+                               x_step_q4, w, h, bd);
+  }
+}
+
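+// Vertical 8-tap convolution for the non-scaled case (y_step_q4 == 16),
+// producing four output rows per iteration and carrying seven source rows
+// between iterations.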
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+                                      ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      const int16_t *y_filter_ptr, int w, int h,
+                                      int bd) {
+  assert(w >= 4 && h >= 4);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      uint16x4_t d1 =
+          highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      uint16x4_t d2 =
+          highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      uint16x4_t d3 =
+          highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      uint16x8_t d01 = vcombine_u16(d0, d1);
+      uint16x8_t d23 = vcombine_u16(d2, d3);
+
+      d01 = vminq_u16(d01, max);
+      d23 = vminq_u16(d23, max);
+
+      vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+      vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+      vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+      vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+                                                   s7, y_filter);
+        uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+                                                   s8, y_filter);
+        uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+                                                   s9, y_filter);
+        uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+                                                   s10, y_filter);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                    uint8_t *dst8, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                x_step_q4, filter_y, y_step_q4, w, h, bd);
+  } else {
+    (void)filter_x;
+    (void)x_step_q4;
+
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+    src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+    highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+                              bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c
index aad2046..d28617c 100644
--- a/aom_dsp/arm/highbd_hadamard_neon.c
+++ b/aom_dsp/arm/highbd_hadamard_neon.c
@@ -109,7 +109,7 @@
   // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
   hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
 
-  transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
 
   // For the second pass we need to widen to 32-bit elements, so we're
   // processing 4 columns at a time.
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 63f53c3..366ca3f 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -15,6 +15,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/intrapred_common.h"
 
 // -----------------------------------------------------------------------------
@@ -191,7 +192,7 @@
     uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
     uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
     uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
-    int sum = horizontal_add_and_broadcast_long_u16x8(sum_vec)[0];      \
+    int sum = horizontal_add_u16x8(sum_vec);                            \
     int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
     highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
   }
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 2b5128e..77727b7 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -298,7 +298,7 @@
 
   uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
                         vld1_u16(dst_q1) };
-  transpose_u16_4x4(src);
+  transpose_array_inplace_u16_4x4(src);
 
   // Adjust thresholds to bitdepth.
   const int outer_thresh = *blimit << (bd - 8);
@@ -344,7 +344,7 @@
     vget_high_u16(p0q0_output),
     vget_high_u16(p1q1_output),
   };
-  transpose_u16_4x4(output);
+  transpose_array_inplace_u16_4x4(output);
 
   vst1_u16(dst_p1, output[0]);
   vst1_u16(dst_p0, output[1]);
@@ -386,7 +386,7 @@
   // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
   //        ^^^^^^                          ^^^^^^
   // Should dual issue with the left shift.
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
   sum = vaddq_u16(sum, outer_sum);
 
@@ -401,7 +401,7 @@
   // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
   //        ^^^^^^^^
   sum = vsubq_u16(sum, p2q2_double);
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
 
   *p0q0_output = vrshrq_n_u16(sum, 3);
@@ -505,7 +505,7 @@
   // and src_raw[3] after transpose.
   uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
                             vld1q_u16(dst_2), vld1q_u16(dst_3) };
-  transpose_u16_4x8q(src_raw);
+  transpose_array_inplace_u16_4x8(src_raw);
   // p2, p1, p0, q0, q1, q2
   const uint16x4_t src[6] = {
     vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
@@ -574,7 +574,7 @@
     vget_high_u16(p0q0_output),
     vget_high_u16(p1q1_output),
   };
-  transpose_u16_4x4(output);
+  transpose_array_inplace_u16_4x4(output);
 
   // dst_n starts at p2, so adjust to p1.
   vst1_u16(dst_0 + 1, output[0]);
@@ -626,7 +626,7 @@
 
   // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
   //                                               ^^^^^^
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   sum = vaddq_u16(sum, q0p0);
 
   *p2q2_output = vrshrq_n_u16(sum, 3);
@@ -635,7 +635,7 @@
   // p1 = p2 - p3 - p2 + p1 + q1
   // q1 = q2 - q3 - q2 + q0 + p1
   sum = vsubq_u16(sum, p23q23);
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
 
   *p1q1_output = vrshrq_n_u16(sum, 3);
@@ -644,7 +644,7 @@
   // p0 = p1 - p3 - p1 + p0 + q2
   // q0 = q1 - q3 - q1 + q0 + p2
   sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
-  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
   sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
 
   *p0q0_output = vrshrq_n_u16(sum, 3);
@@ -827,7 +827,7 @@
   uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
   // After transpose, |output| will contain rows of the form:
   // p0 p1 p2 p3 q0 q1 q2 q3
-  transpose_u16_4x8q(output);
+  transpose_array_inplace_u16_4x8(output);
 
   // Reverse p values to produce original order:
   // p3 p2 p1 p0 q0 q1 q2 q3
@@ -883,7 +883,7 @@
   //                                                           ^^
   // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
   //      ^^
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   sum = vaddq_u16(sum, q0p0);
 
   *p5q5_output = vrshrq_n_u16(sum, 4);
@@ -892,7 +892,7 @@
   // p4 = p5 - (2 * p6) + p3 + q1
   // q4 = q5 - (2 * q6) + q3 + p1
   sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
 
   *p4q4_output = vrshrq_n_u16(sum, 4);
@@ -901,7 +901,7 @@
   // p3 = p4 - p6 - p5 + p2 + q2
   // q3 = q4 - q6 - q5 + q2 + p2
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
-  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
   sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
 
   *p3q3_output = vrshrq_n_u16(sum, 4);
@@ -910,7 +910,7 @@
   // p2 = p3 - p6 - p4 + p1 + q3
   // q2 = q3 - q6 - q4 + q1 + p3
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
-  const uint16x8_t q3p3 = transpose64_u16q(p3q3);
+  const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
   sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
 
   *p2q2_output = vrshrq_n_u16(sum, 4);
@@ -919,7 +919,7 @@
   // p1 = p2 - p6 - p3 + p0 + q4
   // q1 = q2 - q6 - q3 + q0 + p4
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
-  const uint16x8_t q4p4 = transpose64_u16q(p4q4);
+  const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
   sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
 
   *p1q1_output = vrshrq_n_u16(sum, 4);
@@ -928,7 +928,7 @@
   // p0 = p1 - p6 - p2 + q0 + q5
   // q0 = q1 - q6 - q2 + p0 + p5
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
-  const uint16x8_t q5p5 = transpose64_u16q(p5q5);
+  const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
   sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
 
   *p0q0_output = vrshrq_n_u16(sum, 4);
@@ -1118,14 +1118,14 @@
   uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
                           vld1q_u16(dst_3) };
   // p7 will be the low half of src_p[0]. Not used until the end.
-  transpose_u16_4x8q(src_p);
+  transpose_array_inplace_u16_4x8(src_p);
 
   // Low halves:  q0 q1 q2 q3
   // High halves: q4 q5 q6 q7
   uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
                           vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
   // q7 will be the high half of src_q[3]. Not used until the end.
-  transpose_u16_4x8q(src_q);
+  transpose_array_inplace_u16_4x8(src_q);
 
   // Adjust thresholds to bitdepth.
   const int outer_thresh = *blimit << (bd - 8);
@@ -1238,10 +1238,10 @@
   const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
   uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
                              p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
-  transpose_u16_4x8q(output_p);
+  transpose_array_inplace_u16_4x8(output_p);
   uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
                              p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
-  transpose_u16_4x8q(output_q);
+  transpose_array_inplace_u16_4x8(output_q);
 
   // Reverse p values to produce original order:
   // p3 p2 p1 p0 q0 q1 q2 q3
diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000..9262d81
--- /dev/null
+++ b/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+                                             const uint16_t *src,
+                                             const uint16_t *a,
+                                             const uint16_t *b,
+                                             const uint8_t *m) {
+  const uint16x8_t s0 = vld1q_u16(src);
+  const uint16x8_t a0 = vld1q_u16(a);
+  const uint16x8_t b0 = vld1q_u16(b);
+  const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
+  uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+  return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  sad = masked_sad_8x1_neon(sad, src, a, b, m);
+  return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  sad = masked_sad_16x1_neon(sad, src, a, b, m);
+  return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
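+// With 12-bit input the largest per-pixel absolute difference is 4095, so a
+// uint16_t lane can hold at most 16 such values (16 * 4095 = 65520). Each
+// uint16x8_t accumulator below covers a 32-pixel span (4 values per lane per
+// row), so partial sums are widened to 32 bits every 4 rows. The 64- and
+// 32-wide variants follow the same pattern.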
+static INLINE unsigned int masked_sad_128xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+
+  do {
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                         vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+      sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+      sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+    sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+    height -= 4;
+  } while (height != 0);
+
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+  sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+  return horizontal_add_u32x4(sad_u32[0]);
+}
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad = vdupq_n_u16(0);
+    for (int h = 0; h < 4; ++h) {
+      sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
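+// 16 pixels per row is 2 values per lane, so up to 8 rows fit in the 16-bit
+// accumulator before widening.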
+static INLINE unsigned int masked_sad_16xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 8; ++h) {
+      sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 8;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 16; ++h) {
+      sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 16;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 128 elements in the
+  // uint16x8_t SAD accumulator, so we can only process up to 8 rows before
+  // we have to widen the accumulation to 32-bit elements.
+  assert(height <= 8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t sad = vdupq_n_u16(0);
+
+  do {
+    sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 128 elements in the
+  // uint16x8_t SAD accumulator, so we can only process up to 16 rows before
+  // we have to widen the accumulation to 32-bit elements.
+  assert(height <= 16);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t sad = vdupq_n_u16(0);
+
+  do {
+    sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 64 elements in the
+  // uint16x4_t SAD accumulator, so we can only process up to 16 rows before
+  // we have to widen the accumulation to 32-bit elements.
+  assert(height <= 16);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  uint16x4_t sad = vdup_n_u16(0);
+  do {
+    uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+    uint16x4_t a0 = load_unaligned_u16_4x1(a);
+    uint16x4_t b0 = load_unaligned_u16_4x1(b);
+    uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+    uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+    sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x4(sad);
+}
+
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride,  \
+                                           second_pred, w, msk, msk_stride,   \
+                                           h);                                \
+    else                                                                      \
+      return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w,   \
+                                           ref, ref_stride, msk, msk_stride,  \
+                                           h);                                \
+  }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride,  \
+                                           second_pred, w, msk, msk_stride,   \
+                                           h);                                \
+    else                                                                      \
+      return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w,   \
+                                           ref, ref_stride, msk, msk_stride,  \
+                                           h);                                \
+  }
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000..28699e6
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+                                                const int32_t *mask,
+                                                const int32_t *wsrc,
+                                                uint32x4_t *sum) {
+  int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+  int32x4_t wsrc_lo = vld1q_s32(wsrc);
+  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+  int32x4_t mask_lo = vld1q_s32(mask);
+  int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+  int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+  int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+  int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
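+  // The absolute differences are scaled by 1 << 12 (the OBMC mask precision),
+  // so vrsraq_n_u32 folds the matching rounding shift right by 12 into the
+  // accumulation.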
+  *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+  *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum = vdupq_n_u32(0);
+
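+  // Two rows of four elements are processed per iteration; all block heights
+  // used with this width are even.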
+  int h = height / 2;
+  do {
+    uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+    ref_ptr += 2 * ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--h != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t r = vld1q_u16(ref_ptr);
+
+    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+    ref_ptr += ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--height != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int width, int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    int i = 0;
+    do {
+      uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+      highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+      uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+      highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+      wsrc += 16;
+      mask += 16;
+      i += 16;
+    } while (i < width);
+
+    ref_ptr += ref_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int height) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+  do {
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+    uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+    uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+    highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+    highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+    highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+    highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+    wsrc += 32;
+    mask += 32;
+    ref_ptr += ref_stride;
+  } while (--height != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h)                                   \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_neon(                      \
+      const uint8_t *ref, int ref_stride, const int32_t *wsrc,           \
+      const int32_t *mask) {                                             \
+    return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+  }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000..d592246
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     uint32x4_t *sse,
+                                                     int32x4_t *sum) {
+  int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+  int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+  int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+  int32x4_t mask_lo = vld1q_s32(&mask[0]);
+  int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+  int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+  int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+  int32x4_t diff_hi =
+      vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+  diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+  diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero, whereas vrshrq_n_s32 rounds to nearest with ties rounded up.
+  // The two only differ for values that lie exactly on a rounding breakpoint,
+  // so we can add -1 to all negative numbers to move the breakpoint one value
+  // across and into the correct rounding region.
+  diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+  diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
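+  // (vsraq_n_s32(d, d, 31) adds d >> 31, which is -1 for negative lanes and 0
+  // otherwise.)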
+  int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+  int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+  *sum = vaddq_s32(*sum, round_lo);
+  *sum = vaddq_s32(*sum, round_hi);
+  *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+                   vreinterpretq_u32_s32(round_lo));
+  *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+                   vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data, we can only accumulate up to 256 squared differences in
+// each unsigned 32-bit accumulator lane (4095 * 4095 * 256 = 4292870400 <=
+// UINT32_MAX) before we have to accumulate into 64-bit elements. Therefore
+// blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are processed
+// in a different helper function.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+    int64_t *sum) {
+  uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
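+  // Each 32-bit lane accumulates width / 8 squared differences per row, so
+  // the 256-per-lane limit is reached after 2048 / width rows: 16 at width
+  // 128, 32 at width 64 and 64 at width 32.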
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  do {
+    uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+    int j = 0;
+
+    do {
+      int i = 0;
+
+      do {
+        uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+        highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+                                          &sum_s32);
+
+        uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+        highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+                                          &sum_s32);
+
+        i += 16;
+        wsrc += 16;
+        mask += 16;
+      } while (i < width);
+
+      pre_ptr += pre_stride;
+      j++;
+    } while (j < h_tmp);
+
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+    h -= h_tmp;
+  } while (h != 0);
+
+  *sse = horizontal_add_u64x2(sse_u64);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_large_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+  uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+      highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+      uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+      highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+                                        &sum_s32);
+
+      i += 16;
+      wsrc += 16;
+      mask += 16;
+    } while (i < width);
+
+    pre_ptr += pre_stride;
+  } while (--h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+                                  sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    uint16x8_t pre_u16 = vld1q_u16(pre);
+
+    highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+    pre += pre_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  assert(h % 2 == 0);
+  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+    highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+    pre += 2 * pre_stride;
+    wsrc += 8;
+    mask += 8;
+    h -= 2;
+  } while (h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
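+// Scale the accumulated sum and sse down to 8-bit-equivalent precision: sum
+// by (bitdepth - 8) bits and sse by 2 * (bitdepth - 8) bits, with rounding.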
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                               int *sum, unsigned int *sse) {
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
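+// The wrappers below compute variance as sse - sum * sum / (w * h) on the
+// normalized sum and sse values.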
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth)                         \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
+      const int32_t *mask, unsigned int *sse) {                               \
+    int sum;                                                                  \
+    int64_t sum64;                                                            \
+    uint64_t sse64;                                                           \
+    highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+                                      &sum64);                                \
+    highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse);          \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));             \
+  }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth)                 \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(        \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
+      const int32_t *mask, unsigned int *sse) {                              \
+    int sum;                                                                 \
+    int64_t sum64;                                                           \
+    uint64_t sse64;                                                          \
+    highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+                                             &sse64, &sum64);                \
+    highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse);         \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));            \
+  }
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index 77a7aac..6149c9f 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -11,14 +11,11 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <string.h>
 
 #include "config/aom_config.h"
 
 #include "aom_dsp/quantize.h"
-#include "aom_dsp/arm/mem_neon.h"
-
-#include "av1/common/quant_common.h"
-#include "av1/encoder/av1_quantize.h"
 
 static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
 #if AOM_ARCH_AARCH64
@@ -83,6 +80,7 @@
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void get_min_max_lane_eob(const int16_t *iscan,
                                         int16x8_t *v_eobmin,
                                         int16x8_t *v_eobmax, uint16x8_t v_mask,
@@ -91,13 +89,14 @@
   const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
 #if SKIP_EOB_FACTOR_ADJUST
   const int16x8_t v_nz_iscan_min =
-      vbslq_s16(v_mask, v_iscan, vdupq_n_s16(n_coeffs));
+      vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
   *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
 #else
   (void)v_eobmin;
 #endif
   *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
@@ -117,6 +116,7 @@
 #endif
 }
 
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
 static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
 #if AOM_ARCH_AARCH64
   return (uint16_t)vminvq_s16(v_eobmin);
@@ -134,6 +134,7 @@
   return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
 #endif
 }
+#endif  // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
 
 static void highbd_quantize_b_neon(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
@@ -298,7 +299,7 @@
   int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
   uint16x4_t v_mask_lo, v_mask_hi;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
-  int16x8_t v_eobmin = vdupq_n_s16(n_coeffs);
+  int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
 
   assert(n_coeffs > 8);
   // Pre-scan pass
diff --git a/aom_dsp/arm/highbd_sad4d_neon.c b/aom_dsp/arm/highbd_sad4d_neon.c
deleted file mode 100644
index f2fda36..0000000
--- a/aom_dsp/arm/highbd_sad4d_neon.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/sum_neon.h"
-
-static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int i = 0;
-  do {
-    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
-    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
-    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
-    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
-    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
-
-    sum[0] = vabal_u16(sum[0], s, r0);
-    sum[1] = vabal_u16(sum[1], s, r1);
-    sum[2] = vabal_u16(sum[2], s, r2);
-    sum[3] = vabal_u16(sum[3], s, r3);
-
-  } while (++i < h);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
-                        vdupq_n_u16(0) };
-  uint32x4_t sum_u32[4];
-
-  int i = 0;
-  do {
-    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
-
-    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
-    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
-    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
-    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
-
-  } while (++i < h);
-
-  sum_u32[0] = vpaddlq_u16(sum[0]);
-  sum_u32[1] = vpaddlq_u16(sum[1]);
-  sum_u32[2] = vpaddlq_u16(sum[2]);
-  sum_u32[3] = vpaddlq_u16(sum[3]);
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
-}
-
-static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
-                             uint32x4_t *const sad_sum) {
-  uint16x8_t abs_diff = vabdq_u16(src, ref);
-  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
-}
-
-static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int i = 0;
-  do {
-    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
-    sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
-    sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
-    sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
-    sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
-
-  } while (++i < h);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum[4];
-
-  int i = 0;
-  do {
-    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
-    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
-
-    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
-    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
-
-  } while (++i < h);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int w, int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum[4];
-
-  int i = 0;
-  do {
-    int j = 0;
-    do {
-      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
-      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
-
-      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
-      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
-
-      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
-      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
-                &sum_lo[0]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
-                &sum_lo[1]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
-                &sum_lo[2]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
-                &sum_lo[3]);
-
-      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
-      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
-                &sum_hi[0]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
-                &sum_hi[1]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
-                &sum_hi[2]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
-                &sum_hi[3]);
-
-      j += 32;
-    } while (j < w);
-
-  } while (++i < h);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad128xhx4d_large_neon(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
-    int ref_stride, uint32_t res[4], int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
-                              128, h);
-}
-
-static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
-                              h);
-}
-
-static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
-                              h);
-}
-
-#define HBD_SAD_WXH_4D_SMALL_NEON(w, h)                                      \
-  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride,  \
-                                    sad_array, (h));                         \
-  }
-
-#define HBD_SAD_WXH_4D_LARGE_NEON(w, h)                                      \
-  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride,  \
-                                    sad_array, (h));                         \
-  }
-
-HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
-#endif  // !CONFIG_REALTIME_ONLY
-
-#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h)                                 \
-  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array,          \
-                                    2 * ref_stride, sad_array, ((h) >> 1));  \
-    sad_array[0] <<= 1;                                                      \
-    sad_array[1] <<= 1;                                                      \
-    sad_array[2] <<= 1;                                                      \
-    sad_array[3] <<= 1;                                                      \
-  }
-
-#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h)                                 \
-  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array,          \
-                                    2 * ref_stride, sad_array, ((h) >> 1));  \
-    sad_array[0] <<= 1;                                                      \
-    sad_array[1] <<= 1;                                                      \
-    sad_array[2] <<= 1;                                                      \
-    sad_array[3] <<= 1;                                                      \
-  }
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
-#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index 919eb55..d51f639 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -61,6 +61,7 @@
   return horizontal_add_u16x8(sum);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
@@ -82,6 +83,7 @@
 
   return horizontal_add_u32x4(sum_u32);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
                                                  int src_stride,
@@ -283,3 +285,225 @@
 
 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
 #endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    uint16x4_t p = vld1_u16(pred16_ptr);
+
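+    // vrhadd_u16 computes the rounded average (r + p + 1) >> 1 of the
+    // reference and second prediction.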
+    uint16x4_t avg = vrhadd_u16(r, p);
+    sum = vabal_u16(sum, s, avg);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 4;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    uint16x8_t p = vld1q_u16(pred16_ptr);
+
+    uint16x8_t avg = vrhaddq_u16(r, p);
+    uint16x8_t diff = vabdq_u16(s, avg);
+    sum = vpadalq_u16(sum, diff);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 8;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1, p0, p1;
+    uint16x8_t avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u16(src16_ptr);
+    r0 = vld1q_u16(ref16_ptr);
+    p0 = vld1q_u16(pred16_ptr);
+    avg0 = vrhaddq_u16(r0, p0);
+    diff0 = vabdq_u16(s0, avg0);
+    sum[0] = vpadalq_u16(sum[0], diff0);
+
+    s1 = vld1q_u16(src16_ptr + 8);
+    r1 = vld1q_u16(ref16_ptr + 8);
+    p1 = vld1q_u16(pred16_ptr + 8);
+    avg1 = vrhaddq_u16(r1, p1);
+    diff1 = vabdq_u16(s1, avg1);
+    sum[1] = vpadalq_u16(sum[1], diff1);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 16;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+      uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+      s0 = vld1q_u16(src16_ptr + j);
+      r0 = vld1q_u16(ref16_ptr + j);
+      p0 = vld1q_u16(pred16_ptr + j);
+      avg0 = vrhaddq_u16(r0, p0);
+      diff0 = vabdq_u16(s0, avg0);
+      sum[0] = vpadalq_u16(sum[0], diff0);
+
+      s1 = vld1q_u16(src16_ptr + j + 8);
+      r1 = vld1q_u16(ref16_ptr + j + 8);
+      p1 = vld1q_u16(pred16_ptr + j + 8);
+      avg1 = vrhaddq_u16(r1, p1);
+      diff1 = vabdq_u16(s1, avg1);
+      sum[1] = vpadalq_u16(sum[1], diff1);
+
+      s2 = vld1q_u16(src16_ptr + j + 16);
+      r2 = vld1q_u16(ref16_ptr + j + 16);
+      p2 = vld1q_u16(pred16_ptr + j + 16);
+      avg2 = vrhaddq_u16(r2, p2);
+      diff2 = vabdq_u16(s2, avg2);
+      sum[2] = vpadalq_u16(sum[2], diff2);
+
+      s3 = vld1q_u16(src16_ptr + j + 24);
+      r3 = vld1q_u16(ref16_ptr + j + 24);
+      p3 = vld1q_u16(pred16_ptr + j + 24);
+      avg3 = vrhaddq_u16(r3, p3);
+      diff3 = vabdq_u16(s3, avg3);
+      sum[3] = vpadalq_u16(sum[3], diff3);
+
+      j += 32;
+    } while (j < w);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += w;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+                                h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+                                second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                                second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
+  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),  \
+                                      second_pred);                           \
+  }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sadxd_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000..85ca673
--- /dev/null
+++ b/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+    sum[3] = vabal_u16(sum[3], s, r3);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32[4];
+
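+  // With 8 pixels per row, each 16-bit lane takes one absolute difference per
+  // row, so heights up to 16 (4095 * 16 = 65520 <= UINT16_MAX) cannot
+  // overflow the accumulators.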
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+  } while (++i < h);
+
+  sum_u32[0] = vpaddlq_u16(sum[0]);
+  sum_u32[1] = vpaddlq_u16(sum[1]);
+  sum_u32[2] = vpaddlq_u16(sum[2]);
+  sum_u32[3] = vpaddlq_u16(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+                             uint32x4_t *const sad_sum) {
+  uint16x8_t abs_diff = vabdq_u16(src, ref);
+  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
+    sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
+    sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
+    sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
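+// Generic kernel for widths that are a multiple of 32: each inner iteration
+// covers 32 pixels per reference block, alternating the four 8-pixel partial
+// SADs between the sum_lo and sum_hi accumulator sets.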
+static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int w, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+                &sum_lo[3]);
+
+      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+                &sum_hi[3]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad128xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              128, h);
+}
+
+static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+                              h);
+}
+
+static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+                              h);
+}
+
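+// Wrappers mapping each block size onto a small or large kernel above. The
+// 8-wide small kernel keeps a 16-bit running SAD per lane, so it is only
+// instantiated for heights where that cannot overflow; the large kernels
+// accumulate into 32-bit lanes on every row.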
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
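+// The "skip" variants estimate the block SAD from every other row: the strides
+// are doubled and the height halved, then the accumulated SADs are doubled to
+// approximate the full-height result.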
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h)                                 \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array,          \
+                                    2 * ref_stride, sad_array, ((h) >> 1));  \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h)                                 \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array,          \
+                                    2 * ref_stride, sad_array, ((h) >> 1));  \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
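+// The x3d kernels below mirror the x4d kernels above but compare the source
+// against only three reference blocks; the fourth ref_ptr/res slot is simply
+// left untouched.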
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+  res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+  res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+    uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+    uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+    uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+  res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+  res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int w, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum[3];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+
+      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+                              h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+                              h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000..184e9f9
--- /dev/null
+++ b/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
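+// Accumulate the sum of squared differences for one row of 8 pixels. The
+// *_init_* variant starts new accumulators with widening multiplies (vmull) so
+// callers can skip zero-initialization; the plain variant accumulates on top
+// of existing sums (vmlal).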
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+                                            const uint16_t *ref,
+                                            uint32x4_t *sse_acc0,
+                                            uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint32x4_t *sse_acc0,
+                                       uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int height) {
+  uint32x4_t sse[16];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+  highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+  highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+  highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+  highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+  highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+  highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+  highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+    highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+    highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+    highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+    highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[4];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint32x4_t sse[2];
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  // Peel the first loop iteration.
+  uint16x4_t s = vld1_u16(src);
+  uint16x4_t r = vld1_u16(ref);
+
+  uint16x4_t abs_diff = vabd_u16(s, r);
+  uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    s = vld1_u16(src);
+    r = vld1_u16(ref);
+
+    abs_diff = vabd_u16(s, r);
+    sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int width, int height) {
+  // { 0, 1, 2, 3, 4, 5, 6, 7 }
+  uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+  uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
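+  // remainder_mask has all bits set in lanes whose index is below (width & 7),
+  // so a final partial block of fewer than 8 pixels can be zeroed out before
+  // the squared-difference accumulation below.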
+  uint64_t sse = 0;
+
+  do {
+    int w = width;
+    int offset = 0;
+
+    do {
+      uint16x8_t s = vld1q_u16(src + offset);
+      uint16x8_t r = vld1q_u16(ref + offset);
+
+      if (w < 8) {
+        // Mask out-of-range elements.
+        s = vandq_u16(s, remainder_mask);
+        r = vandq_u16(r, remainder_mask);
+      }
+
+      uint16x8_t abs_diff = vabdq_u16(s, r);
+      uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+      uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+      uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+      sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+      sse += horizontal_long_add_u32x4(sse_u32);
+
+      offset += 8;
+      w -= 8;
+    } while (w > 0);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+                            const uint8_t *ref8, int ref_stride, int width,
+                            int height) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    case 128:
+      return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+                                 height);
+  }
+}
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000..bdbbf70
--- /dev/null
+++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
+//  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common factor, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128, enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
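+//
+// A worked illustration (filter_offset == 3 is an arbitrary example, not a
+// value taken from the code): f0 = 8 - 3 = 5 and f1 = 3, so each filtered
+// pixel is (5 * s0 + 3 * s1 + 4) >> 3, i.e. the vmul/vmla accumulation
+// followed by the rounding shift vrshr_n_u16(blend, 3) used in the kernels
+// below.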
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, blend);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+  } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+                                                uint16_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, blend);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+                                               uint16_t *dst_ptr,
+                                               int src_stride, int pixel_step,
+                                               int dst_height,
+                                               int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      128, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
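+// General sub-pixel variance path: filter horizontally into an intermediate
+// buffer of (h + 1) rows, filter that buffer vertically, then pass the result
+// to the matching highbd variance kernel.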
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+                                                                               \
+    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
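+// Specialized path used for the 16-pixel-and-wider blocks: xoffset/yoffset
+// values of 0 (no filtering) and 4 (a simple pairwise average) are handled
+// without the full bilinear multiply, falling back to it only for the
+// remaining offsets.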
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+                                      h);                                      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
+                                           src_stride, h, yoffset);            \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 8, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 16, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 32, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 64, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 128, dst_height,
+                                               filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+      uint16x8_t p = vld1q_u16(second_pred);
+      avg = vrhaddq_u16(avg, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                            int src_stride, int dst_width, int dst_height,
+                            const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = vrhaddq_u16(s, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
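+// Compound (avg) sub-pixel variance: filter horizontally as above, then fuse
+// the vertical filter with the rounding average against second_pred so the
+// compound prediction does not need a separate pass.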
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
+  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t tmp0[w * (h + 1)];                                               \
+    uint16_t tmp1[w * h];                                                     \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
+                                                                              \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+                                       xoffset);                              \
+    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
+        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));      \
+                                                                              \
+    return aom_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
+        CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                   \
+  }
+
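+// Specialized case for block widths >= 16: branch on the filter offsets. An
+// offset of 0 needs no filtering in that direction, an offset of 4 (half-pel)
+// reduces to averaging two neighboring pixels, and any other offset uses the
+// general bilinear filter.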
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
+      const uint8_t *second_pred) {                                            \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
+                        CONVERT_TO_SHORTPTR(second_pred));                     \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else if (yoffset == 4) {                                               \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp, source_stride, source_stride, w, h,                  \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp0, source_stride, 1, w, h,                             \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
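+// Apply the 2D bilinear filter to src, blend the filtered block with
+// second_pred according to msk via aom_highbd_comp_mask_pred_neon, then
+// compute the variance of the result against ref.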
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                   \
+  unsigned int                                                                \
+      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(      \
+          const uint8_t *src, int src_stride, int xoffset, int yoffset,       \
+          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,     \
+          const uint8_t *msk, int msk_stride, int invert_mask,                \
+          unsigned int *sse) {                                                \
+    uint16_t tmp0[w * (h + 1)];                                               \
+    uint16_t tmp1[w * (h + 1)];                                               \
+    uint16_t tmp2[w * h];                                                     \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+                                       xoffset);                              \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
+    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w,  \
+                                   h, CONVERT_TO_BYTEPTR(tmp1), w, msk,       \
+                                   msk_stride, invert_mask);                  \
+    return aom_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
+        CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                   \
+  }
+
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)        \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
+          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
+          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
+          const uint8_t *msk, int msk_stride, int invert_mask,                 \
+          unsigned int *sse) {                                                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp0[w * h];                                                    \
+      if (yoffset == 0) {                                                      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred,  \
+                                       w, h, src, src_stride, msk, msk_stride, \
+                                       invert_mask);                           \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride,   \
+                                      w, h);                                   \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride,          \
+                                           src_stride, h, yoffset);            \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      if (yoffset == 0) {                                                      \
+        uint16_t tmp0[w * h];                                                  \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp0[w * (h + 1)];                                            \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp0[w * (h + 1)];                                            \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
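+// Apply the 2D bilinear filter to pre, then compute the OBMC variance of the
+// filtered block against the weighted source wsrc and mask.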
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                \
+  unsigned int                                                              \
+      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(      \
+          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,     \
+          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {    \
+    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                           \
+    uint16_t tmp0[w * (h + 1)];                                             \
+    uint16_t tmp1[w * h];                                                   \
+    highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+                                       xoffset);                            \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);       \
+    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(           \
+        CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                      \
+  }
+
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)       \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(         \
+          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,        \
+          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {       \
+    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                              \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            pre, pre_stride, wsrc, mask, sse);                                 \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+                                      h);                                      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride,           \
+                                           pre_stride, h, yoffset);            \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h);     \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h,    \
+                                           xoffset);                           \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
+                                           h + 1, xoffset);                    \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
+                                           h + 1, xoffset);                    \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif  // !CONFIG_REALTIME_ONLY
+
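+// Implementation of aom_highbd_dist_wtd_comp_avg_pred for blocks having
+// width >= 16.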
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                                     int src_stride, int dst_width,
+                                     int dst_height,
+                                     const uint16_t *second_pred,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
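+// Combine averaging subpel filter with aom_highbd_dist_wtd_comp_avg_pred.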
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
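+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 4.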
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+  const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+    vst1_u16(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for large
+// blocks.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
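+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 8.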
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
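+// General case: apply the horizontal bilinear filter, then the vertical
+// filter combined with distance-weighted averaging against second_pred, and
+// compute the variance of the result against ref_ptr.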
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)              \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
+          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);                       \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1,     \
+                                       xoffset);                               \
+    highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                      \
+        tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                      \
+    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
+        CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);                \
+  }
+
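+// Specialized case for block widths >= 16: branch on the filter offsets
+// (0: no filtering, 4: half-pel averaging, otherwise the general bilinear
+// filter) before applying the distance-weighted average with second_pred.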
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)  \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
+          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);                       \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second,        \
+                                 jcp_param);                                   \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      } else if (yoffset == 4) {                                               \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
+            src, tmp, source_stride, source_stride, w, h, second, jcp_param);  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      } else {                                                                 \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            src, tmp, source_stride, source_stride, h, yoffset, second,        \
+            jcp_param);                                                        \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
+            src, tmp0, source_stride, 1, w, h, second, jcp_param);             \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1);  \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w,   \
+                                                        h, second, jcp_param); \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1);  \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            src, tmp0, source_stride, 1, h, xoffset, second, jcp_param);       \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+                                           xoffset);                           \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w,   \
+                                                        h, second, jcp_param); \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+                                           xoffset);                           \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c
index 948f2f7..e54fc18 100644
--- a/aom_dsp/arm/highbd_variance_neon.c
+++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -15,10 +15,10 @@
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
 
 // Process a block of width 4 two rows at a time.
 static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
@@ -412,67 +412,6 @@
   return *sse;
 }
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
-                                            int src_stride,
-                                            const uint16_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            unsigned int *sse) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h / 2;
-  do {
-    uint16x8_t s0 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    uint16x8_t s1 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    uint16x8_t r0 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-    uint16x8_t r1 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-
-    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    uint8x16_t diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-  } while (--i != 0);
-
-  *sse = horizontal_add_u32x4(sse_u32);
-  return *sse;
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
-                                             int src_stride,
-                                             const uint16_t *ref_ptr,
-                                             int ref_stride, int h,
-                                             unsigned int *sse) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint16x8_t s0 = vld1q_u16(src_ptr);
-    uint16x8_t s1 = vld1q_u16(src_ptr + 8);
-    uint16x8_t r0 = vld1q_u16(ref_ptr);
-    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
-
-    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    uint8x16_t diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  *sse = horizontal_add_u32x4(sse_u32);
-  return *sse;
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
                                             int src_stride,
                                             const uint16_t *ref_ptr,
@@ -491,8 +430,6 @@
                              sse);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 #define HIGHBD_MSE_WXH_NEON(w, h)                                       \
   uint32_t aom_highbd_8_mse##w##x##h##_neon(                            \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,   \
@@ -529,3 +466,55 @@
 HIGHBD_MSE_WXH_NEON(8, 8)
 
 #undef HIGHBD_MSE_WXH_NEON
+
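+// Accumulate the squared differences of two rows of eight 16-bit pixels,
+// widening to 32 bits for the squares and to 64 bits for the running total.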
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+                                                uint16x8_t s1, uint16x8_t d0,
+                                                uint16x8_t d1) {
+  uint16x8_t e0 = vabdq_u16(s0, d0);
+  uint16x8_t e1 = vabdq_u16(s1, d1);
+
+  uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+  mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+  mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+  mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+  return vpadalq_u32(sum, mse);
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t sum = vdupq_n_u64(0);
+
+  if (w == 8) {
+    do {
+      uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+      uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+      uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+      uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+      sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+      dst += 2 * dstride;
+      src += 2 * sstride;
+      h -= 2;
+    } while (h != 0);
+  } else {  // w == 4
+    do {
+      uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+      uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      h -= 4;
+    } while (h != 0);
+  }
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000..d56ae97
--- /dev/null
+++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
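+// The aom_highbd_8_mse* functions operate on 8-bit content held in 16-bit
+// buffers, so the samples can be narrowed to 8 bits without loss and the
+// squared differences accumulated with the UDOT (vdotq_u32) instruction.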
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h,
+                                                    unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    uint16x8_t r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h,
+                                                     unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                 \
+  uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod(                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+                                     sse);                                \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 2161378..41f070e 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,7 +24,7 @@
 // DC 4x4
 
 static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
-  const uint8x8_t a = load_u8_4x1_lane0(in);
+  const uint8x8_t a = load_u8_4x1(in);
   const uint16x4_t p0 = vpaddl_u8(a);
   const uint16x4_t p1 = vpadd_u16(p0, p0);
   return vcombine_u16(p1, vdup_n_u16(0));
@@ -354,7 +354,7 @@
 
 void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x8_t a = load_u8_4x1(above);
   uint8x8_t l = vld1_u8(left);
   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
   uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
@@ -364,7 +364,7 @@
 void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   uint8x8_t a = vld1_u8(above);
-  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint8x8_t l = load_u8_4x1(left);
   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
   uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
   dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
@@ -372,7 +372,7 @@
 
 void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x8_t a = load_u8_4x1(above);
   uint8x16_t l = vld1q_u8(left);
   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
   uint32_t sum = horizontal_add_u16x8(sum_al);
@@ -383,7 +383,7 @@
 void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   uint8x16_t a = vld1q_u8(above);
-  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint8x8_t l = load_u8_4x1(left);
   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
   uint32_t sum = horizontal_add_u16x8(sum_al);
   uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
@@ -620,7 +620,7 @@
 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
 }
 
 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -646,13 +646,13 @@
 void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
 }
 
 void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
 }
 
 void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -856,7 +856,7 @@
 
 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
   store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
@@ -907,7 +907,7 @@
 
 void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
   vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
@@ -936,7 +936,7 @@
 
 void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
@@ -1594,8 +1594,10 @@
       base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
 
 #if AOM_ARCH_AARCH64
-      uint8x8_t left_idx0 = vreinterpret_u8_s16(base_y_c64 + 2);  // [0, 16]
-      uint8x8_t left_idx1 = vreinterpret_u8_s16(base_y_c64 + 3);  // [1, 17]
+      uint8x8_t left_idx0 =
+          vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
+      uint8x8_t left_idx1 =
+          vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]
 
       uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
       uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
@@ -1777,8 +1779,10 @@
       base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
 
 #if AOM_ARCH_AARCH64
-      uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c128 + 2);  // [0, 33]
-      uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c128 + 3);  // [1, 34]
+      uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+          vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
+      uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+          vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
       uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
 
       uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
@@ -2025,8 +2029,10 @@
 
 #if AOM_ARCH_AARCH64
           // Values in left_idx{0,1} range from 0 through 63 inclusive.
-          uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c256.val[0] + 1);
-          uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c256.val[1] + 1);
+          uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+              vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1)));
+          uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+              vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1)));
 
           uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
 
@@ -3168,12 +3174,10 @@
   const uint8_t bottom_left = left_column[height - 1];
   const uint8_t *const weights_y = smooth_weights + height - 4;
 
-  uint8x8_t UNINITIALIZED_IS_SAFE(top_v);
-  load_u8_4x1(top_row, &top_v, 0);
+  uint8x8_t top_v = load_u8_4x1(top_row);
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
-  uint8x8_t UNINITIALIZED_IS_SAFE(weights_x_v);
-  load_u8_4x1(smooth_weights, &weights_x_v, 0);
+  uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
 
@@ -3403,9 +3407,9 @@
     const uint8_t bottom_left = left_column[height - 1];              \
     const uint8_t *const weights_y = smooth_weights + height - 4;     \
                                                                       \
-    uint8x8_t UNINITIALIZED_IS_SAFE(top_v);                           \
+    uint8x8_t top_v;                                                  \
     if ((W) == 4) {                                                   \
-      load_u8_4x1(top_row, &top_v, 0);                                \
+      top_v = load_u8_4x1(top_row);                                   \
     } else { /* width == 8 */                                         \
       top_v = vld1_u8(top_row);                                       \
     }                                                                 \
@@ -3717,9 +3721,9 @@
                                        int width, int height) {
   const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
-  uint8x8_t UNINITIALIZED_IS_SAFE(top);
+  uint8x8_t top;
   if (width == 4) {
-    load_u8_4x1(top_row, &top, 0);
+    top = load_u8_4x1(top_row);
   } else {  // width == 8
     top = vld1_u8(top_row);
   }
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 8fc7ccb..0e683a7 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -634,13 +634,13 @@
   p6p2 = vget_low_u8(row1);
   p5p1 = vget_low_u8(row2);
   p4p0 = vget_low_u8(row3);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+  transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
 
   q0q4 = vget_high_u8(row0);
   q1q5 = vget_high_u8(row1);
   q2q6 = vget_high_u8(row2);
   q3qy = vget_high_u8(row3);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+  transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
   pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
@@ -679,13 +679,13 @@
   q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
   q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
   q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+  transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
 
   pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
   p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
   p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
   p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+  transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
 
   row0 = vcombine_u8(pxp3, q0q4);
   row1 = vcombine_u8(p6p2, q1q5);
@@ -725,7 +725,7 @@
   // row3: p3 p2 p1 p0 | q0 q1 q2 q3
   load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
 
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
   p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
@@ -750,7 +750,7 @@
   p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
   p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
   p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
 
   store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
@@ -784,7 +784,7 @@
   // row3: px p2 p1 p0 | q0 q1 q2 qy
   load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
 
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+  transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
   pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
@@ -809,7 +809,7 @@
   p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
   p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
   pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+  transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
 
   store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
 }
@@ -834,7 +834,7 @@
                              const uint8_t *limit, const uint8_t *thresh) {
   uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
   uint32x2_t pq_rev;
-  uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1);
+  uint8x8_t p1p0, q0q1;
   uint8x8_t p0q0, p1q1;
 
   // row0: p1 p0 | q0 q1
@@ -843,7 +843,7 @@
   // row3: p1 p0 | q0 q1
   load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
 
-  transpose_u8_4x4(&p1p0, &q0q1);
+  transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
 
   p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
 
@@ -860,7 +860,7 @@
   p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
   q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
 
-  transpose_u8_4x4(&p1p0, &q0q1);
+  transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
 
   store_unaligned_u8_4x1(src - 2, p1p0, 0);
   store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
@@ -886,25 +886,13 @@
 
 void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1),
-      UNINITIALIZED_IS_SAFE(p2q2), UNINITIALIZED_IS_SAFE(p3q3),
-      UNINITIALIZED_IS_SAFE(p4q4), UNINITIALIZED_IS_SAFE(p5q5),
-      UNINITIALIZED_IS_SAFE(p6q6);
-
-  load_u8_4x1(src - 7 * stride, &p6q6, 0);
-  load_u8_4x1(src - 6 * stride, &p5q5, 0);
-  load_u8_4x1(src - 5 * stride, &p4q4, 0);
-  load_u8_4x1(src - 4 * stride, &p3q3, 0);
-  load_u8_4x1(src - 3 * stride, &p2q2, 0);
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
-  load_u8_4x1(src + 2 * stride, &p2q2, 1);
-  load_u8_4x1(src + 3 * stride, &p3q3, 1);
-  load_u8_4x1(src + 4 * stride, &p4q4, 1);
-  load_u8_4x1(src + 5 * stride, &p5q5, 1);
-  load_u8_4x1(src + 6 * stride, &p6q6, 1);
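+  // load_u8_4x2(p, stride) packs four bytes at p into the low half and four
+  // bytes at p + stride into the high half, so the (2 * n + 1) * stride
+  // offsets below pair each pN row with its matching qN row.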
+  uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+  uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+  uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+  uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+  uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
 
   lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
               *thresh);
@@ -1036,12 +1024,8 @@
 
 void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1);
-
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
 
   lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
 
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index 98daeda..8f65b80 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -516,19 +516,18 @@
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-#define MASKED_SAD4D_WXH_NEON(w, h)                                           \
-  void aom_masked_sad##w##x##h##x4d_neon(                                     \
-      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
-      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int invert_mask, uint32_t res[4]) {                     \
-    if (invert_mask) {                                                        \
-      return masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,  \
-                                           second_pred, msk, msk_stride, res, \
-                                           h);                                \
-    } else {                                                                  \
-      return masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,      \
-                                       second_pred, msk, msk_stride, res, h); \
-    }                                                                         \
+#define MASKED_SAD4D_WXH_NEON(w, h)                                            \
+  void aom_masked_sad##w##x##h##x4d_neon(                                      \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int invert_mask, uint32_t res[4]) {                      \
+    if (invert_mask) {                                                         \
+      masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,          \
+                                    second_pred, msk, msk_stride, res, h);     \
+    } else {                                                                   \
+      masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+                                msk, msk_stride, res, h);                      \
+    }                                                                          \
   }
 
 MASKED_SAD4D_WXH_NEON(4, 8)
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 340df05..9d26310 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -15,9 +15,10 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/blend.h"
-#include "mem_neon.h"
-#include "sum_neon.h"
 
 static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
                                               const uint8_t *src,
@@ -29,15 +30,7 @@
   uint8x16_t b0 = vld1q_u8(b);
   uint8x16_t s0 = vld1q_u8(src);
 
-  uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-  uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
-  uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
-  blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
-  blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
-
-  uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
-  uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
-  uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
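+  // alpha_blend_a64_u8x16() implements the same rounded blend as the code it
+  // replaces: (m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b), rounded and
+  // shifted right by AOM_BLEND_A64_ROUND_BITS.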
+  uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
 
   return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
 }
@@ -164,10 +157,7 @@
     uint8x8_t b0 = vld1_u8(b);
     uint8x8_t s0 = vld1_u8(src);
 
-    uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-    uint16x8_t blend_u16 = vmull_u8(m0, a0);
-    blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+    uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
 
     sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
 
@@ -199,10 +189,7 @@
     uint8x8_t b0 = load_unaligned_u8(b, b_stride);
     uint8x8_t s0 = load_unaligned_u8(src, src_stride);
 
-    uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-    uint16x8_t blend_u16 = vmull_u8(m0, a0);
-    blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+    uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
 
     sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
 
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 16d44c5..d1ac648 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -43,6 +43,11 @@
   return res;
 }
 
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+  return res;
+}
+
 static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
@@ -85,18 +90,31 @@
   return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
 }
 
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define load_u8_4x1(s, s0, lane)                                           \
-  do {                                                                     \
-    *(s0) = vreinterpret_u8_u32(                                           \
-        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
-  } while (0)
-
 // Load four bytes into the low half of a uint8x8_t, zero the upper half.
-static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
   uint8x8_t ret = vdup_n_u8(0);
-  load_u8_4x1(p, &ret, 0);
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+  return ret;
+}
+
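+// Load four bytes at p into the low half of a uint8x8_t and four bytes at
+// p + stride into the high half.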
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+  uint8x8_t ret = vdup_n_u8(0);
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+  p += stride;
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+  return ret;
+}
+
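+// Load two 16-bit elements at p into lanes 0-1 of a uint16x4_t and two
+// 16-bit elements at p + stride into lanes 2-3.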
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+  uint16x4_t ret = vdup_n_u16(0);
+  ret = vreinterpret_u16_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+  p += stride;
+  ret = vreinterpret_u16_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
   return ret;
 }
 
@@ -214,6 +232,38 @@
   s += p;
 }
 
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+                                 int16x4_t *const s0, int16x4_t *const s1,
+                                 int16x4_t *const s2, int16x4_t *const s3,
+                                 int16x4_t *const s4, int16x4_t *const s5,
+                                 int16x4_t *const s6, int16x4_t *const s7,
+                                 int16x4_t *const s8, int16x4_t *const s9,
+                                 int16x4_t *const s10, int16x4_t *const s11) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+  s += p;
+  *s8 = vld1_s16(s);
+  s += p;
+  *s9 = vld1_s16(s);
+  s += p;
+  *s10 = vld1_s16(s);
+  s += p;
+  *s11 = vld1_s16(s);
+}
+
 static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                  int16x4_t *const s0, int16x4_t *const s1,
                                  int16x4_t *const s2, int16x4_t *const s3,
@@ -316,6 +366,23 @@
   *s6 = vld1_s16(s);
 }
 
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+}
+
 static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
@@ -592,6 +659,33 @@
   *s10 = vld1_u8(s);
 }
 
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+}
+
 static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                  int16x8_t *const s0, int16x8_t *const s1,
                                  int16x8_t *const s2, int16x8_t *const s3,
@@ -622,6 +716,38 @@
   *s10 = vld1q_s16(s);
 }
 
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9,
+                                 int16x8_t *const s10, int16x8_t *const s11) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+  s += p;
+  *s10 = vld1q_s16(s);
+  s += p;
+  *s11 = vld1q_s16(s);
+}
+
 static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                  uint16x8_t *const s0, uint16x8_t *const s1,
                                  uint16x8_t *const s2, uint16x8_t *const s3,
@@ -714,6 +840,23 @@
   *s6 = vld1q_s16(s);
 }
 
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+}
+
 static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
@@ -793,6 +936,24 @@
   return vreinterpret_u8_u32(a_u32);
 }
 
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+  uint32_t a;
+  uint32x2_t a_u32;
+
+  memcpy(&a, buf, 4);
+  a_u32 = vdup_n_u32(a);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+  uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
 static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
   uint32_t a;
   uint32x2_t a_u32;
@@ -844,6 +1005,20 @@
     memcpy(dst, &a, 2);                                \
   } while (0)
 
+#define store_unaligned_u16_2x1(dst, src, lane)         \
+  do {                                                  \
+    uint32_t a;                                         \
+    a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+    memcpy(dst, &a, 4);                                 \
+  } while (0)
+
+#define store_unaligned_u16_4x1(dst, src, lane)           \
+  do {                                                    \
+    uint64_t a;                                           \
+    a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+    memcpy(dst, &a, 8);                                   \
+  } while (0)
+
 static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                 uint8x16_t *const s0, uint8x16_t *const s1,
                                 uint8x16_t *const s2, uint8x16_t *const s3,
@@ -917,6 +1092,27 @@
   *s7 = vld1q_u16(s + 8);
 }
 
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+                                                int stride) {
+  uint32_t a;
+  uint32x2_t a_u32;
+
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vdup_n_u32(a);
+  memcpy(&a, buf, 4);
+  a_u32 = vset_lane_u32(a, a_u32, 1);
+  return vreinterpret_u16_u32(a_u32);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+  uint64_t a;
+  uint64x1_t a_u64 = vdup_n_u64(0);
+  memcpy(&a, buf, 8);
+  a_u64 = vset_lane_u64(a, a_u64, 0);
+  return vreinterpret_u16_u64(a_u64);
+}
+
 static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                 uint32_t stride) {
   uint64_t a;
@@ -1004,4 +1200,32 @@
   vst1q_s32(buf, v0);
 }
 
+static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
+                                          uint8x8_t src) {
+  store_unaligned_u8_2x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u8_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
+                                          uint8x8_t src) {
+  store_unaligned_u8_4x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u8_4x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
+                                           uint16x4_t src) {
+  store_unaligned_u16_2x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u16_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
+                                           uint16x8_t src) {
+  store_unaligned_u16_4x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u16_4x1(dst, src, 1);
+}
+
 #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 60efef8..46a1666 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -15,93 +15,10 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *ref_ptr, int ref_stride,
-                                       int w, int h) {
-  // Only two accumulators are required for optimal instruction throughput of
-  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
-      s0 = vld1q_u8(src_ptr + j);
-      r0 = vld1q_u8(ref_ptr + j);
-      diff0 = vabdq_u8(s0, r0);
-      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-      s1 = vld1q_u8(src_ptr + j + 16);
-      r1 = vld1q_u8(ref_ptr + j + 16);
-      diff1 = vabdq_u8(s1, r1);
-      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-      j += 32;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
-                                         const uint8_t *ref_ptr, int ref_stride,
-                                         int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
-}
-
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
-}
-
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
-}
-
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h / 2;
-  do {
-    uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
-    s0 = vld1q_u8(src_ptr);
-    r0 = vld1q_u8(ref_ptr);
-    diff0 = vabdq_u8(s0, r0);
-    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    s1 = vld1q_u8(src_ptr);
-    r1 = vld1q_u8(ref_ptr);
-    diff1 = vabdq_u8(s1, r1);
-    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          int h) {
@@ -220,28 +137,25 @@
 static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
-  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
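+  // 16-bit accumulation is safe here: each UADALP adds at most 2 * 255 = 510
+  // to a lane per row, and blocks of width 32 are at most 64 rows tall.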
 
   int i = h;
   do {
     uint8x16_t s0 = vld1q_u8(src_ptr);
     uint8x16_t r0 = vld1q_u8(ref_ptr);
     uint8x16_t diff0 = vabdq_u8(s0, r0);
-    uint16x8_t sum0 = vpaddlq_u8(diff0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
 
     uint8x16_t s1 = vld1q_u8(src_ptr + 16);
     uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
     uint8x16_t diff1 = vabdq_u8(s1, r1);
-    uint16x8_t sum1 = vpaddlq_u8(diff1);
-
-    sum = vpadalq_u16(sum, sum0);
-    sum = vpadalq_u16(sum, sum1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
 
     src_ptr += src_stride;
     ref_ptr += ref_stride;
   } while (--i != 0);
 
-  return horizontal_add_u32x4(sum);
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
 static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -264,8 +178,6 @@
   return horizontal_add_u16x8(sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
@@ -384,114 +296,6 @@
 
 #undef SAD_SKIP_WXH_NEON
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
-                                           int src_stride,
-                                           const uint8_t *ref_ptr,
-                                           int ref_stride, int w, int h,
-                                           const uint8_t *second_pred) {
-  // Only two accumulators are required for optimal instruction throughput of
-  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-      s0 = vld1q_u8(src_ptr + j);
-      r0 = vld1q_u8(ref_ptr + j);
-      p0 = vld1q_u8(second_pred);
-      avg0 = vrhaddq_u8(r0, p0);
-      diff0 = vabdq_u8(s0, avg0);
-      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-      s1 = vld1q_u8(src_ptr + j + 16);
-      r1 = vld1q_u8(ref_ptr + j + 16);
-      p1 = vld1q_u8(second_pred + 16);
-      avg1 = vrhaddq_u8(r1, p1);
-      diff1 = vabdq_u8(s1, avg1);
-      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-      j += 32;
-      second_pred += 32;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
-                                             int src_stride,
-                                             const uint8_t *ref_ptr,
-                                             int ref_stride, int h,
-                                             const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h / 2;
-  do {
-    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-    s0 = vld1q_u8(src_ptr);
-    r0 = vld1q_u8(ref_ptr);
-    p0 = vld1q_u8(second_pred);
-    avg0 = vrhaddq_u8(r0, p0);
-    diff0 = vabdq_u8(s0, avg0);
-    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-
-    s1 = vld1q_u8(src_ptr);
-    r1 = vld1q_u8(ref_ptr);
-    p1 = vld1q_u8(second_pred);
-    avg1 = vrhaddq_u8(r1, p1);
-    diff1 = vabdq_u8(s1, avg1);
-    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
@@ -644,7 +448,7 @@
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
                                             const uint8_t *second_pred) {
-  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
 
   int i = h;
   do {
@@ -653,24 +457,21 @@
     uint8x16_t p0 = vld1q_u8(second_pred);
     uint8x16_t avg0 = vrhaddq_u8(r0, p0);
     uint8x16_t diff0 = vabdq_u8(s0, avg0);
-    uint16x8_t sum0 = vpaddlq_u8(diff0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
 
     uint8x16_t s1 = vld1q_u8(src_ptr + 16);
     uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
     uint8x16_t p1 = vld1q_u8(second_pred + 16);
     uint8x16_t avg1 = vrhaddq_u8(r1, p1);
     uint8x16_t diff1 = vabdq_u8(s1, avg1);
-    uint16x8_t sum1 = vpaddlq_u8(diff1);
-
-    sum = vpadalq_u16(sum, sum0);
-    sum = vpadalq_u16(sum, sum1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
 
     src_ptr += src_stride;
     ref_ptr += ref_stride;
     second_pred += 32;
   } while (--i != 0);
 
-  return horizontal_add_u32x4(sum);
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
 static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
@@ -698,8 +499,6 @@
   return horizontal_add_u16x8(sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
@@ -788,3 +587,287 @@
 #endif  // !CONFIG_REALTIME_ONLY
 
 #undef SAD_WXH_AVG_NEON
+
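+// The dist_wtd_avg_u8x8/u8x16 helpers (dist_wtd_avg_neon.h) form the rounded,
+// distance-weighted average of the second prediction and the reference using
+// the fwd/bck offsets from DIST_WTD_COMP_PARAMS; the SAD is then taken
+// against that weighted average.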
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to prevent overflow for large values of 'h', as
+  // well as to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes. Each UADALP adds at most 2 * 255 = 510 to a
+  // 16-bit lane per row, so even the full h = 128 rows cannot overflow.
+  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
+    uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    uint8x16_t p4 = vld1q_u8(second_pred + 64);
+    uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+    uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+    sum[4] = vpadalq_u8(sum[4], diff4);
+
+    uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    uint8x16_t p5 = vld1q_u8(second_pred + 80);
+    uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+    uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+    sum[5] = vpadalq_u8(sum[5], diff5);
+
+    uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    uint8x16_t p6 = vld1q_u8(second_pred + 96);
+    uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+    uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+    sum[6] = vpadalq_u8(sum[6], diff6);
+
+    uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    uint8x16_t p7 = vld1q_u8(second_pred + 112);
+    uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+    uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+    sum[7] = vpadalq_u8(sum[7], diff7);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--h != 0);
+
+  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+  return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--h != 0);
+
+  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+  return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+    uint8x16_t p = vld1q_u8(second_pred);
+
+    uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+    uint8x16_t diff = vabdq_u8(s, wtd_avg);
+    sum = vpadalq_u8(sum, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+    sum = vabal_u8(sum, s, wtd_avg);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+    sum = vabal_u8(sum, s, wtd_avg);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--i != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h)                                        \
+  unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
+    return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+                                        second_pred, jcp_param);               \
+  }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000..5504c68
--- /dev/null
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
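+  // Summing the absolute differences with UDOT against a vector of ones
+  // folds four adjacent bytes straight into each 32-bit accumulator lane,
+  // so no intermediate 16-bit widening steps are needed.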
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+                                                 int src_stride,
+                                                 const uint8_t *ref_ptr,
+                                                 int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
+  unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,              \
+      int ref_stride) {                                                    \
+    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
+  unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,    \
+      int ref_stride) {                                          \
+    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+                                       2 * ref_stride, (h) / 2); \
+  }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
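+  // vrhaddq_u8() forms the rounded average of the reference and second
+  // prediction, i.e. (r + p + 1) >> 1, before the SAD is taken against the
+  // source.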
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    p1 = vld1q_u8(second_pred);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
+  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+                                       second_pred);                          \
+  }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to minimize the accumulation and loop-carried
+  // dependencies for better instruction throughput.
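+  // Each 16-byte segment of the 128-pixel row feeds its own accumulator, so
+  // successive ABD/UDOT pairs are independent of one another.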
+  uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+    uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    uint8x16_t p4 = vld1q_u8(second_pred + 64);
+    uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+    uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+    sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+    uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    uint8x16_t p5 = vld1q_u8(second_pred + 80);
+    uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+    uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+    sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+    uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    uint8x16_t p6 = vld1q_u8(second_pred + 96);
+    uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+    uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+    sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+    uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    uint8x16_t p7 = vld1q_u8(second_pred + 112);
+    uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+    uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+    sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[4] = vaddq_u32(sum[4], sum[5]);
+  sum[6] = vaddq_u32(sum[6], sum[7]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+  sum[4] = vaddq_u32(sum[4], sum[6]);
+  sum[0] = vaddq_u32(sum[0], sum[4]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    uint8x16_t s1 = vld1q_u8(src_ptr);
+    uint8x16_t r1 = vld1q_u8(ref_ptr);
+    uint8x16_t p1 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h)                               \
+  unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod(                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \