Merge tag 'v3.7.1' into HEAD

libaom 3.7.1

2023-11-17 v3.7.1
  This release includes several bug fixes and is ABI compatible with
  the last release. See
  https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the
  commits in this release.

  - Bug Fixes
    * aomedia:3349: heap overflow when increasing resolution
    * aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on
      aom/av1/encoder/motion_search_facade.c
    * aomedia:3489: Detect encoder and image high bit depth mismatch
    * aomedia:3491: heap-buffer-overflow on frame size change
    * b/303023614: Segfault at encoding time for high bit depth images

Bug: aomedia:3513
Change-Id: Iecf1f155b4f0ea2604ef27fef0d6111499ea9bad
diff --git a/.mailmap b/.mailmap
index 7d31a70..6d6e6302 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,12 +1,16 @@
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
 Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
-Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Alexander Bokov <alexanderbokov@google.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Andrey Norkin <anorkin@netflix.com>
 Angie Chiang <angiebird@google.com>
 Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
 Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Aasaipriya Chandran <aasaipriya.c@ittiam.com>
 Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
 Apurve Pandey <apurve.pandey@ittiam.com>
@@ -27,9 +31,10 @@
 Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
 Guillaume Martres <smarter@ubuntu.com>
 Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
-Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
 Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
 Hui Su <huisu@google.com>
 Iole Moccagatta <iole.moccagatta@gmail.com>
 Jacky Chen <jackychen@google.com>
@@ -40,13 +45,14 @@
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Kyle Siefring <siekyleb@amazon.com>
 Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
 Lin Zheng <linzhen@google.com>
-Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Luc Trudeau <luc@trud.ca>
 Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
 Marco Paniconi <marpan@google.com>
@@ -56,6 +62,7 @@
 Mingliang Chen <mlchen@google.com>
 Monty Montgomery <cmontgomery@mozilla.com>
 Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
 Mudassir Galaganath <mudassir.galaganath@ittiam.com> Mudassir Galagnath
 Nathan E. Egge <negge@mozilla.com>
 Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
@@ -72,13 +79,14 @@
 Remya Prakasan <remya.prakasan@ittiam.com>
 Roger Zhou <youzhou@microsoft.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
-Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
 Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
 Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
 Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
 Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
 Sarah Parker <sarahparker@google.com>
+Susanna D'Souza <susannad@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
@@ -90,14 +98,16 @@
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
 Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com>
 Wei-Ting Lin <weitinglin@google.com>
 Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
 Wenyao Liu <wenyao.liu@cidana.com>
 Will Bresnahan <bill.wresnahan@gmail.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
 Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
-Yaowu Xu <yaowu@google.com> <Yaowu Xu>
-Yaowu Xu <yaowu@google.com> <yaowu.google.com>
 Zhipin Deng <zhipin.deng@intel.com>
 Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e0b65f..8e6ca6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -288,9 +288,9 @@
   add_library(aom_static STATIC ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
   set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
   if(MSVC OR (WIN32 AND NOT MINGW))
-    # Fix race condition on the export library file between the two versions.
-    # Affects MSVC in all three flavors (stock, Clang/CL, LLVM-- the latter sets
-    # MSVC and MINGW both to FALSE).
+    # Fix race condition between the import library and the static library.
+    # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter
+    # sets MSVC and MINGW both to FALSE).
     set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll")
   endif()
 
@@ -323,7 +323,7 @@
   endif()
 endif()
 
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
   list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
               "${AOM_ROOT}/av1/ratectrl_rtc.cc")
   add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
@@ -336,7 +336,7 @@
 
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
-if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc)
 endif()
 if(BUILD_SHARED_LIBS)
@@ -387,13 +387,6 @@
   endif()
 endif()
 
-if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
-  add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
-                             $<TARGET_OBJECTS:aom_common_app_util>)
-  set_property(TARGET ${example} PROPERTY FOLDER examples)
-  list(APPEND AOM_APP_TARGETS resize_util)
-endif()
-
 if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
   add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
                         $<TARGET_OBJECTS:aom_common_app_util>
@@ -494,14 +487,18 @@
                                     $<TARGET_OBJECTS:aom_common_app_util>
                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
 
-    add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
-                                   $<TARGET_OBJECTS:aom_common_app_util>
-                                   $<TARGET_OBJECTS:aom_encoder_app_util>)
-
     # Maintain a list of encoder example targets.
     list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
                 photon_noise_table set_maps simple_encoder scalable_encoder
-                twopass_encoder svc_encoder_rtc)
+                twopass_encoder)
+
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
+                                     $<TARGET_OBJECTS:aom_common_app_util>
+                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
+      target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc)
+      list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc)
+    endif()
   endif()
 
   if(ENABLE_TOOLS)
@@ -852,7 +849,7 @@
 # Aomedia documentation rule.
 set(DOXYGEN_VERSION_VALUE 0)
 if(ENABLE_DOCS)
-  include(FindDoxygen)
+  find_package(Doxygen)
   if(DOXYGEN_FOUND)
     # Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
     set(MINIMUM_DOXYGEN_VERSION 1008010)
@@ -942,7 +939,8 @@
 get_cmake_property(all_cmake_vars VARIABLES)
 foreach(var ${all_cmake_vars})
   if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
-     AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST")
+     AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST"
+     AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_")
     list(APPEND aom_source_vars ${var})
   endif()
 endforeach()
diff --git a/README.md b/README.md
index d7b66e0..4e2eb27 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,7 @@
 The toolchain files available at the time of this writing are:
 
  - arm64-ios.cmake
+ - arm64-linux-clang.cmake
  - arm64-linux-gcc.cmake
  - arm64-mingw-gcc.cmake
  - armv7-ios.cmake
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index e3d8d29..5d0bbe1 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -1006,11 +1006,11 @@
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 
 /*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
-#define AOM_USAGE_GOOD_QUALITY (0)
+#define AOM_USAGE_GOOD_QUALITY 0u
 /*!\brief usage parameter analogous to AV1 REALTIME mode. */
-#define AOM_USAGE_REALTIME (1)
+#define AOM_USAGE_REALTIME 1u
 /*!\brief usage parameter analogous to AV1 all intra mode. */
-#define AOM_USAGE_ALL_INTRA (2)
+#define AOM_USAGE_ALL_INTRA 2u
 
 /*!\brief Encode a frame
  *
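Note: the AOM_USAGE_* values above are the ones passed to
aom_codec_enc_config_default(). A minimal sketch of requesting the realtime
defaults (not part of this change; the 30 fps timebase and CBR mode are
arbitrary illustrative choices):

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    /* Sketch: fetch the realtime defaults and fill in the basics. */
    static int init_realtime_cfg(aom_codec_enc_cfg_t *cfg, int w, int h) {
      if (aom_codec_enc_config_default(aom_codec_av1_cx(), cfg,
                                       AOM_USAGE_REALTIME) != AOM_CODEC_OK)
        return -1;
      cfg->g_w = w;
      cfg->g_h = h;
      cfg->g_timebase.num = 1; /* illustrative 30 fps timebase */
      cfg->g_timebase.den = 30;
      cfg->rc_end_usage = AOM_CBR; /* 1 pass CBR, as used by the new control */
      return 0;
    }
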
diff --git a/aom/aomcx.h b/aom/aomcx.h
index a5db0a5..f061be3 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -208,14 +208,14 @@
    * encoding process, values greater than 0 will increase encoder speed at
    * the expense of quality.
    *
-   * Valid range: 0..10. 0 runs the slowest, and 10 runs the fastest;
+   * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest;
    * quality improves as speed decreases (since more compression
    * possibilities are explored).
    *
-   * NOTE: 10 is only allowed in AOM_USAGE_REALTIME. In AOM_USAGE_GOOD_QUALITY
-   * and AOM_USAGE_ALL_INTRA, 9 is the highest allowed value. However,
-   * AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, AOM_USAGE_REALTIME
-   * treats 0..4 the same as 5.
+   * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In
+   * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed
+   * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also,
+   * AOM_USAGE_REALTIME treats 0..4 the same as 5.
    */
   AOME_SET_CPUUSED = 13,
 
@@ -1527,6 +1527,12 @@
    */
   AV1E_SET_BITRATE_ONE_PASS_CBR = 163,
 
+  /*!\brief Codec control to set the maximum number of consecutive frame drops
+   * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of
+   * zero has no effect.
+   */
+  AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164,
+
   // Any new encoder control IDs should be added above.
   // Maximum allowed encoder control ID is 229.
   // No encoder control ID should be added below.
@@ -1678,10 +1684,10 @@
 
 /*!brief Parameters for setting ref frame config */
 typedef struct aom_svc_ref_frame_config {
-  // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
-  // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  // 7 references: The index 0 - 6 refers to the references:
+  // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
   int reference[7]; /**< Reference flag for each of the 7 references. */
-  /*! Buffer slot index for each of 7 references. */
+  /*! Buffer slot index for each of 7 references indexed above. */
   int ref_idx[7];
   int refresh[8]; /**< Refresh flag for each of the 8 slots. */
 } aom_svc_ref_frame_config_t;
@@ -2172,6 +2178,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int)
 #define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR
 
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int)
+#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
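Note: a matching sketch of the encoder controls touched above, applied to an
already-initialized realtime, 1 pass CBR context (the speed value 10, the drop
cap of 2, and the single-reference pattern are arbitrary examples, not
recommendations):

    #include <string.h>
    #include "aom/aomcx.h"

    static void apply_rt_controls(aom_codec_ctx_t *codec) {
      /* Speed 10 and 11 are only accepted under AOM_USAGE_REALTIME. */
      aom_codec_control(codec, AOME_SET_CPUUSED, 10);

      /* New control: cap consecutive frame drops for the 1 pass CBR frame
       * dropper; a value of zero has no effect. */
      aom_codec_control(codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 2);

      /* Reference indexing as documented above: last(0) ... altref(6).
       * Reference only LAST and refresh buffer slot 0. */
      aom_svc_ref_frame_config_t ref_cfg;
      memset(&ref_cfg, 0, sizeof(ref_cfg));
      ref_cfg.reference[0] = 1; /* predict from last(0) */
      ref_cfg.ref_idx[0] = 0;   /* last maps to buffer slot 0 */
      ref_cfg.refresh[0] = 1;   /* refresh slot 0 with this frame */
      aom_codec_control(codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);
    }
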
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4c60e5c..f8f2cbb 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -115,12 +115,17 @@
             "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
 
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
 if(CONFIG_AV1_HIGHBITDEPTH)
   list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
               "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
@@ -134,6 +139,11 @@
               "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
 
   list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
 endif()
 
@@ -191,6 +201,9 @@
 
     list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
                 "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+                "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
   endif()
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
@@ -269,7 +282,15 @@
               "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+              "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
 
   if(CONFIG_AV1_HIGHBITDEPTH)
     list(APPEND AOM_DSP_ENCODER_ASM_SSE2
@@ -292,11 +313,20 @@
 
     list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
                 "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
-                "${AOM_ROOT}/aom_dsp/arm/highbd_sad4d_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+                "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
   endif()
 
   if(CONFIG_INTERNAL_STATS)
@@ -326,6 +356,10 @@
 
     list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
                      "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+                     "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+                     "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
   endif()
 endif()
 
@@ -433,6 +467,23 @@
     endif()
   endif()
 
+  if(HAVE_NEON_DOTPROD)
+    add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                  "aom_dsp_common"
+                                  "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                    "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+    endif()
+  endif()
+
+  if(HAVE_NEON_I8MM)
+    add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+                                  "aom_dsp_common"
+                                  "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+  endif()
+
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
   if(BUILD_SHARED_LIBS)
     target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index efb634a..85dc005 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -23,10 +23,6 @@
 
 #define PI 3.141592653589793238462643383279502884
 
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif  // ndef MAX_SB_SIZE
-
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
 #define AOMSIGN(x) ((x) < 0 ? -1 : 0)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e738971..c9b2682 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -497,22 +497,22 @@
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-specialize qw/aom_convolve_copy       neon sse2 avx2/;
-specialize qw/aom_convolve8_horiz     neon sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert      neon sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve_copy       neon                        sse2 avx2/;
+specialize qw/aom_convolve8_horiz     neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert      neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
 
 add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/aom_scaled_2d ssse3 neon/;
 
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
-  specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+  specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
 
   add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
-  specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
+  specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
 
   add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
-  specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
+  specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
 }
 
 #
@@ -750,7 +750,7 @@
 add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
 add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
 add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
+specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/;
 specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
 specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
 
@@ -759,10 +759,10 @@
   add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
   add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
   add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
-  specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-  specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/;
+  specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/;
+  specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/;
 }
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -773,35 +773,33 @@
   specialize qw/aom_subtract_block neon sse2 avx2/;
 
   add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
-  specialize qw/aom_sse  sse4_1 avx2 neon/;
+  specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/;
 
   add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
-  specialize qw/aom_get_blk_sse_sum sse2 avx2/;
+  specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
     specialize qw/aom_highbd_subtract_block sse2 neon/;
 
     add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
-    specialize qw/aom_highbd_sse  sse4_1 avx2 neon/;
+    specialize qw/aom_highbd_sse sse4_1 avx2 neon/;
   }
 
-  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    #
-    # Sum of Squares
-    #
-    add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
-    specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
+  #
+  # Sum of Squares
+  #
+  add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+  specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
 
-    add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
-    specialize qw/aom_sum_squares_i16 sse2 neon/;
+  add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+  specialize qw/aom_sum_squares_i16 sse2 neon/;
 
-    add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
-    specialize qw/aom_var_2d_u8 sse2 avx2 neon/;
+  add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+  specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/;
 
-    add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
-    specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
-  }
+  add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+  specialize qw/aom_var_2d_u16 sse2 avx2 neon/;
 
   #
   # Single block SAD / Single block Avg SAD
@@ -816,65 +814,65 @@
 
   add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
   specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/;
-  specialize qw/aom_sad128x128    avx2 neon sse2/;
-  specialize qw/aom_sad128x64     avx2 neon sse2/;
-  specialize qw/aom_sad64x128     avx2 neon sse2/;
-  specialize qw/aom_sad64x64      avx2 neon sse2/;
-  specialize qw/aom_sad64x32      avx2 neon sse2/;
-  specialize qw/aom_sad32x64      avx2 neon sse2/;
-  specialize qw/aom_sad32x32      avx2 neon sse2/;
-  specialize qw/aom_sad32x16      avx2 neon sse2/;
-  specialize qw/aom_sad16x32           neon sse2/;
-  specialize qw/aom_sad16x16           neon sse2/;
-  specialize qw/aom_sad16x8            neon sse2/;
-  specialize qw/aom_sad8x16            neon sse2/;
-  specialize qw/aom_sad8x8             neon sse2/;
-  specialize qw/aom_sad8x4             neon sse2/;
-  specialize qw/aom_sad4x8             neon sse2/;
-  specialize qw/aom_sad4x4             neon sse2/;
+  specialize qw/aom_sad128x128    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x16            sse2 neon/;
+  specialize qw/aom_sad8x8             sse2 neon/;
+  specialize qw/aom_sad8x4             sse2 neon/;
+  specialize qw/aom_sad4x8             sse2 neon/;
+  specialize qw/aom_sad4x4             sse2 neon/;
 
-  specialize qw/aom_sad4x16            neon sse2/;
-  specialize qw/aom_sad16x4            neon sse2/;
-  specialize qw/aom_sad8x32            neon sse2/;
-  specialize qw/aom_sad32x8            neon sse2/;
-  specialize qw/aom_sad16x64           neon sse2/;
-  specialize qw/aom_sad64x16           neon sse2/;
+  specialize qw/aom_sad4x16            sse2 neon/;
+  specialize qw/aom_sad16x4            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x32            sse2 neon/;
+  specialize qw/aom_sad32x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x16           sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x128     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x16      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_16x32                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x16                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x8                      sse2  neon/;
-  specialize qw/aom_sad_skip_8x4                            neon/;
-  specialize qw/aom_sad_skip_4x8                      sse2  neon/;
-  specialize qw/aom_sad_skip_4x4                            neon/;
+  specialize qw/aom_sad_skip_128x128    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_128x64     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x128     avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x64      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x32      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x16      avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x32           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x16           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_8x16            sse2 neon/;
+  specialize qw/aom_sad_skip_8x8             sse2 neon/;
+  specialize qw/aom_sad_skip_8x4                  neon/;
+  specialize qw/aom_sad_skip_4x8             sse2 neon/;
+  specialize qw/aom_sad_skip_4x4                  neon/;
 
-  specialize qw/aom_sad_skip_4x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_16x4                           neon/;
-  specialize qw/aom_sad_skip_8x32                     sse2  neon/;
-  specialize qw/aom_sad_skip_32x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_16x64                    sse2  neon/;
-  specialize qw/aom_sad_skip_64x16                    sse2  neon/;
+  specialize qw/aom_sad_skip_4x16            sse2 neon/;
+  specialize qw/aom_sad_skip_16x4                 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_8x32            sse2 neon/;
+  specialize qw/aom_sad_skip_32x8            sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x64           sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x16           sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
-  specialize qw/aom_sad128x64_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x128_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad64x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x16_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad16x32_avg        sse2 neon/;
-  specialize qw/aom_sad16x16_avg        sse2 neon/;
-  specialize qw/aom_sad16x8_avg         sse2 neon/;
+  specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x16_avg         sse2 neon/;
   specialize qw/aom_sad8x8_avg          sse2 neon/;
   specialize qw/aom_sad8x4_avg          sse2 neon/;
@@ -882,36 +880,36 @@
   specialize qw/aom_sad4x4_avg          sse2 neon/;
 
   specialize qw/aom_sad4x16_avg         sse2 neon/;
-  specialize qw/aom_sad16x4_avg         sse2 neon/;
+  specialize qw/aom_sad16x4_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x32_avg         sse2 neon/;
-  specialize qw/aom_sad32x8_avg         sse2 neon/;
-  specialize qw/aom_sad16x64_avg        sse2 neon/;
-  specialize qw/aom_sad64x16_avg        sse2 neon/;
+  specialize qw/aom_sad32x8_avg         sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x16_avg        sse2 neon neon_dotprod/;
 
-  specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
-  specialize qw/aom_dist_wtd_sad128x64_avg  sse2/;
-  specialize qw/aom_dist_wtd_sad64x128_avg  sse2/;
-  specialize qw/aom_dist_wtd_sad64x64_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad64x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x64_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad32x16_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x32_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x16_avg   sse2/;
-  specialize qw/aom_dist_wtd_sad16x8_avg    sse2/;
-  specialize qw/aom_dist_wtd_sad8x16_avg    sse2/;
-  specialize qw/aom_dist_wtd_sad8x8_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad8x4_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad4x8_avg     sse2/;
-  specialize qw/aom_dist_wtd_sad4x4_avg     sse2/;
+  specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad128x64_avg  sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x128_avg  sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x64_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad64x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x64_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad32x16_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x32_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x16_avg   sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad16x8_avg    sse2 neon neon_dotprod/;
+  specialize qw/aom_dist_wtd_sad8x16_avg    sse2 neon/;
+  specialize qw/aom_dist_wtd_sad8x8_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad8x4_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad4x8_avg     sse2 neon/;
+  specialize qw/aom_dist_wtd_sad4x4_avg     sse2 neon/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    specialize qw/aom_dist_wtd_sad4x16_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad16x4_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad8x32_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad32x8_avg     sse2/;
-    specialize qw/aom_dist_wtd_sad16x64_avg    sse2/;
-    specialize qw/aom_dist_wtd_sad64x16_avg    sse2/;
+    specialize qw/aom_dist_wtd_sad4x16_avg     sse2 neon/;
+    specialize qw/aom_dist_wtd_sad16x4_avg     sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad8x32_avg     sse2 neon/;
+    specialize qw/aom_dist_wtd_sad32x8_avg     sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad16x64_avg    sse2 neon neon_dotprod/;
+    specialize qw/aom_dist_wtd_sad64x16_avg    sse2 neon neon_dotprod/;
   }
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -974,27 +972,29 @@
     specialize qw/aom_highbd_sad_skip_16x64   avx2 sse2 neon/;
     specialize qw/aom_highbd_sad_skip_64x16   avx2 sse2 neon/;
 
-    specialize qw/aom_highbd_sad128x128_avg avx2/;
-    specialize qw/aom_highbd_sad128x64_avg  avx2/;
-    specialize qw/aom_highbd_sad64x128_avg  avx2/;
-    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4_avg     sse2/;
-    specialize qw/aom_highbd_sad4x8_avg     sse2/;
-    specialize qw/aom_highbd_sad4x4_avg     sse2/;
+    specialize qw/aom_highbd_sad128x128_avg avx2      neon/;
+    specialize qw/aom_highbd_sad128x64_avg  avx2      neon/;
+    specialize qw/aom_highbd_sad64x128_avg  avx2      neon/;
+    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad8x16_avg              neon/;
+    specialize qw/aom_highbd_sad8x8_avg               neon/;
+    specialize qw/aom_highbd_sad8x4_avg          sse2 neon/;
+    specialize qw/aom_highbd_sad4x8_avg          sse2 neon/;
+    specialize qw/aom_highbd_sad4x4_avg          sse2 neon/;
 
-    specialize qw/aom_highbd_sad4x16_avg    sse2/;
-    specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x32_avg    sse2/;
-    specialize qw/aom_highbd_sad32x8_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad16x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad4x16_avg         sse2 neon/;
+    specialize qw/aom_highbd_sad8x32_avg         sse2 neon/;
+    specialize qw/aom_highbd_sad16x4_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad16x64_avg   avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad32x8_avg    avx2 sse2 neon/;
+    specialize qw/aom_highbd_sad64x16_avg   avx2 sse2 neon/;
   }
   #
   # Masked SAD
@@ -1009,7 +1009,7 @@
     foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
+      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
     }
   }
 
@@ -1030,7 +1030,7 @@
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
         if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
         }
       }
     }
@@ -1047,47 +1047,47 @@
     add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
   }
 
-  specialize qw/aom_sad128x128x4d avx2 neon sse2/;
-  specialize qw/aom_sad128x64x4d  avx2 neon sse2/;
-  specialize qw/aom_sad64x128x4d  avx2 neon sse2/;
-  specialize qw/aom_sad64x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad64x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x32x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x8x4d    avx2 neon sse2/;
+  specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8x4d    avx2 sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad8x16x4d         neon sse2/;
-  specialize qw/aom_sad8x8x4d          neon sse2/;
-  specialize qw/aom_sad8x4x4d          neon sse2/;
-  specialize qw/aom_sad4x8x4d          neon sse2/;
-  specialize qw/aom_sad4x4x4d          neon sse2/;
+  specialize qw/aom_sad8x16x4d         sse2 neon/;
+  specialize qw/aom_sad8x8x4d          sse2 neon/;
+  specialize qw/aom_sad8x4x4d          sse2 neon/;
+  specialize qw/aom_sad4x8x4d          sse2 neon/;
+  specialize qw/aom_sad4x4x4d          sse2 neon/;
 
-  specialize qw/aom_sad64x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad32x8x4d    avx2 neon sse2/;
-  specialize qw/aom_sad16x64x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x4x4d    avx2 neon sse2/;
-  specialize qw/aom_sad8x32x4d         neon sse2/;
-  specialize qw/aom_sad4x16x4d         neon sse2/;
+  specialize qw/aom_sad64x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x8x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x4x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad8x32x4d         sse2 neon/;
+  specialize qw/aom_sad4x16x4d         sse2 neon/;
 
-  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad_skip_16x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_16x4x4d              neon/;
+  specialize qw/aom_sad_skip_16x64x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad_skip_16x4x4d              neon neon_dotprod/;
   specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
@@ -1096,29 +1096,29 @@
   specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
   specialize qw/aom_sad_skip_4x4x4d               neon/;
 
-  specialize qw/aom_sad128x128x3d neon avx2/;
-  specialize qw/aom_sad128x64x3d  neon avx2/;
-  specialize qw/aom_sad64x128x3d  neon avx2/;
-  specialize qw/aom_sad64x64x3d   neon avx2/;
-  specialize qw/aom_sad64x32x3d   neon avx2/;
-  specialize qw/aom_sad32x64x3d   neon avx2/;
-  specialize qw/aom_sad32x32x3d   neon avx2/;
-  specialize qw/aom_sad32x16x3d   neon avx2/;
-  specialize qw/aom_sad16x32x3d   neon avx2/;
-  specialize qw/aom_sad16x16x3d   neon avx2/;
-  specialize qw/aom_sad16x8x3d    neon avx2/;
-  specialize qw/aom_sad8x16x3d    neon/;
-  specialize qw/aom_sad8x8x3d     neon/;
-  specialize qw/aom_sad8x4x3d     neon/;
-  specialize qw/aom_sad4x8x3d     neon/;
-  specialize qw/aom_sad4x4x3d     neon/;
+  specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64x3d  avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128x3d  avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8x3d    avx2 neon neon_dotprod/;
+  specialize qw/aom_sad8x16x3d         neon/;
+  specialize qw/aom_sad8x8x3d          neon/;
+  specialize qw/aom_sad8x4x3d          neon/;
+  specialize qw/aom_sad4x8x3d          neon/;
+  specialize qw/aom_sad4x4x3d          neon/;
 
-  specialize qw/aom_sad64x16x3d   neon avx2/;
-  specialize qw/aom_sad32x8x3d    neon avx2/;
-  specialize qw/aom_sad16x64x3d   neon avx2/;
-  specialize qw/aom_sad16x4x3d    neon/;
-  specialize qw/aom_sad8x32x3d    neon/;
-  specialize qw/aom_sad4x16x3d    neon/;
+  specialize qw/aom_sad64x16x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad32x8x3d    avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64x3d   avx2 neon neon_dotprod/;
+  specialize qw/aom_sad16x4x3d         neon neon_dotprod/;
+  specialize qw/aom_sad8x32x3d         neon/;
+  specialize qw/aom_sad4x16x3d         neon/;
 
   specialize qw/aom_masked_sad128x128x4d  ssse3 neon/;
   specialize qw/aom_masked_sad128x64x4d   ssse3 neon/;
@@ -1153,9 +1153,9 @@
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
       if ($w != 128 && $h != 128) {
         specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
       }
@@ -1208,22 +1208,29 @@
     specialize qw/aom_highbd_sad_skip_16x64x4d   avx2 sse2 neon/;
     specialize qw/aom_highbd_sad_skip_64x16x4d   avx2 sse2 neon/;
 
-    specialize qw/aom_highbd_sad128x128x3d avx2/;
-    specialize qw/aom_highbd_sad128x64x3d  avx2/;
-    specialize qw/aom_highbd_sad64x128x3d  avx2/;
-    specialize qw/aom_highbd_sad64x64x3d   avx2/;
-    specialize qw/aom_highbd_sad64x32x3d   avx2/;
-    specialize qw/aom_highbd_sad32x64x3d   avx2/;
-    specialize qw/aom_highbd_sad32x32x3d   avx2/;
-    specialize qw/aom_highbd_sad32x16x3d   avx2/;
-    specialize qw/aom_highbd_sad16x32x3d   avx2/;
-    specialize qw/aom_highbd_sad16x16x3d   avx2/;
-    specialize qw/aom_highbd_sad16x8x3d    avx2/;
+    specialize qw/aom_highbd_sad128x128x3d avx2 neon/;
+    specialize qw/aom_highbd_sad128x64x3d  avx2 neon/;
+    specialize qw/aom_highbd_sad64x128x3d  avx2 neon/;
+    specialize qw/aom_highbd_sad64x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad64x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x32x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x8x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad8x16x3d         neon/;
+    specialize qw/aom_highbd_sad8x8x3d          neon/;
+    specialize qw/aom_highbd_sad8x4x3d          neon/;
+    specialize qw/aom_highbd_sad4x8x3d          neon/;
+    specialize qw/aom_highbd_sad4x4x3d          neon/;
 
-    specialize qw/aom_highbd_sad16x4x3d    avx2/;
-    specialize qw/aom_highbd_sad32x8x3d    avx2/;
-    specialize qw/aom_highbd_sad16x64x3d   avx2/;
-    specialize qw/aom_highbd_sad64x16x3d   avx2/;
+    specialize qw/aom_highbd_sad64x16x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad32x8x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad16x64x3d   avx2 neon/;
+    specialize qw/aom_highbd_sad16x4x3d    avx2 neon/;
+    specialize qw/aom_highbd_sad8x32x3d         neon/;
+    specialize qw/aom_highbd_sad4x16x3d         neon/;
   }
   #
   # Avg
@@ -1323,20 +1330,20 @@
   # Specialty Variance
   #
   add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
-  specialize qw/aom_get_var_sse_sum_8x8_quad        avx2 sse2 neon/;
+  specialize qw/aom_get_var_sse_sum_8x8_quad        avx2 sse2 neon neon_dotprod/;
 
   add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
-  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2 sse2 neon/;
+  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-  specialize qw/aom_mse16x16          sse2 avx2 neon/;
-  specialize qw/aom_mse16x8           sse2      neon/;
-  specialize qw/aom_mse8x16           sse2      neon/;
-  specialize qw/aom_mse8x8            sse2      neon/;
+  specialize qw/aom_mse16x16          sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_mse16x8           sse2      neon neon_dotprod/;
+  specialize qw/aom_mse8x16           sse2      neon neon_dotprod/;
+  specialize qw/aom_mse8x8            sse2      neon neon_dotprod/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
@@ -1345,31 +1352,32 @@
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-      specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
-      specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+      specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/;
+      specialize "aom_highbd_${bd}_mse16x8", qw/neon/;
+      specialize "aom_highbd_${bd}_mse8x16", qw/neon/;
+      specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/;
     }
+
+    specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+    specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
   }
 
   #
   #
   #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
-  specialize qw/aom_get_mb_ss sse2/;
+  specialize qw/aom_get_mb_ss sse2 neon/;
 
   #
   # Variance / Subpixel Variance / Subpixel Avg Variance
   #
-  add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
   add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
   specialize qw/aom_mse_wxh_16bit  sse2 avx2 neon/;
 
   add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
-  specialize qw/aom_mse_16xh_16bit sse2 avx2/;
+  specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
 
   foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
@@ -1378,22 +1386,22 @@
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
-  specialize qw/aom_variance128x128   sse2 avx2 neon/;
-  specialize qw/aom_variance128x64    sse2 avx2 neon/;
-  specialize qw/aom_variance64x128    sse2 avx2 neon/;
-  specialize qw/aom_variance64x64     sse2 avx2 neon/;
-  specialize qw/aom_variance64x32     sse2 avx2 neon/;
-  specialize qw/aom_variance32x64     sse2 avx2 neon/;
-  specialize qw/aom_variance32x32     sse2 avx2 neon/;
-  specialize qw/aom_variance32x16     sse2 avx2 neon/;
-  specialize qw/aom_variance16x32     sse2 avx2 neon/;
-  specialize qw/aom_variance16x16     sse2 avx2 neon/;
-  specialize qw/aom_variance16x8      sse2 avx2 neon/;
-  specialize qw/aom_variance8x16      sse2      neon/;
-  specialize qw/aom_variance8x8       sse2      neon/;
-  specialize qw/aom_variance8x4       sse2      neon/;
-  specialize qw/aom_variance4x8       sse2      neon/;
-  specialize qw/aom_variance4x4       sse2      neon/;
+  specialize qw/aom_variance128x128   sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance128x64    sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x128    sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x64     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance64x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x64     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance32x16     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x32     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x16     sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance16x8      sse2 avx2 neon neon_dotprod/;
+  specialize qw/aom_variance8x16      sse2      neon neon_dotprod/;
+  specialize qw/aom_variance8x8       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance8x4       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance4x8       sse2      neon neon_dotprod/;
+  specialize qw/aom_variance4x4       sse2      neon neon_dotprod/;
 
   specialize qw/aom_sub_pixel_variance128x128   avx2 neon sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance128x64    avx2 neon sse2 ssse3/;
@@ -1430,12 +1438,12 @@
   specialize qw/aom_sub_pixel_avg_variance4x4          neon sse2 ssse3/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    specialize qw/aom_variance4x16  neon sse2/;
-    specialize qw/aom_variance16x4  neon sse2 avx2/;
-    specialize qw/aom_variance8x32  neon sse2/;
-    specialize qw/aom_variance32x8  neon sse2 avx2/;
-    specialize qw/aom_variance16x64 neon sse2 avx2/;
-    specialize qw/aom_variance64x16 neon sse2 avx2/;
+    specialize qw/aom_variance4x16  neon neon_dotprod sse2/;
+    specialize qw/aom_variance16x4  neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance8x32  neon neon_dotprod sse2/;
+    specialize qw/aom_variance32x8  neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+    specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
 
     specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
     specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
@@ -1450,82 +1458,259 @@
     specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
     specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
 
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
-    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
   }
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   neon ssse3/;
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   neon ssse3/;
+  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   neon ssse3/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
       foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-        if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-          if ($bd == 10) {
-            specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse2 neon/;
-          } else {
-            specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-          }
-        }
-
-        if ($w == 4 || $h == 4) {
-          # TODO(rachelbarker): When ext-partition-types is enabled, we currently
-          # don't have vectorized 4x16 highbd variance functions
-          if ($w == 4 && $h == 4) {
-            if ($bd == 10) {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse4_1 neon/;
-            } else {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
-            }
-          } else {
-            if ($bd == 10) {
-              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/neon/;
-            }
-          }
-        }
-
-
-        if ($w != 128 && $h != 128 && $w != 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-        }
-        if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-        }
-
         add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
       }
     }
+
+    specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_12_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_variance8x4          neon/;
+    specialize qw/aom_highbd_12_variance4x8          neon/;
+    specialize qw/aom_highbd_12_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance128x64  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x128  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance64x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance32x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance16x8    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x16    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x8     sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_variance8x4               neon/;
+    specialize qw/aom_highbd_10_variance4x8               neon/;
+    specialize qw/aom_highbd_10_variance4x4   sse4_1      neon/;
+
+    specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_8_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_variance8x4          neon/;
+    specialize qw/aom_highbd_8_variance4x8          neon/;
+    specialize qw/aom_highbd_8_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        my $avx2 = ($bd == 10) ? "avx2" : "";
+        specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
+        specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
+      }
+    }
+
+    specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance4x8          neon/;
+    specialize qw/aom_highbd_12_sub_pixel_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance128x64  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x128  sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance64x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x64   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance32x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x32   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x16   sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance16x8    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x16    sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x8     sse2 avx2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance8x4     sse2      neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance4x8               neon/;
+    specialize qw/aom_highbd_10_sub_pixel_variance4x4   sse4_1      neon/;
+
+    specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance128x64  sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x128  sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance4x8          neon/;
+    specialize qw/aom_highbd_8_sub_pixel_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+      }
+    }
+
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128      neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64       neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128       neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16   sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16    sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4     sse2 neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8          neon/;
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4   sse4_1 neon/;
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+      }
+    }
+
+    foreach $bd (8, 10, 12) {
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16"  , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8"   , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16"   , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8"    , qw/neon/;
+      specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4"    , qw/neon/;
+    }
+
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      foreach $bd (8, 10, 12) {
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+        specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
+      }
+    }
   }
   #
   # Masked Variance / Masked Subpixel Variance
@@ -1541,7 +1726,7 @@
       foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
       }
     }
   }
@@ -1559,56 +1744,18 @@
     }
 
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      foreach $bd ("_", "_10_", "_12_") {
+      foreach $bd ("_8_", "_10_", "_12_") {
         foreach (@encoder_block_sizes) {
           ($w, $h) = @$_;
           add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
           add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+          specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/;
         }
       }
     }
   }
 
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x32 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x64 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x32 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x16 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x4 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x8 sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x4 sse2 ssse3/;
-
   #
   # Comp Avg
   #
@@ -1616,469 +1763,25 @@
   specialize qw/aom_comp_avg_pred avx2 neon/;
 
   add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
-  specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
+  specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-
-    add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance4x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance4x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x128 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x64 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x32 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x4 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance4x8 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance4x4 neon/;
-
-    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-      foreach $bd (8, 10, 12) {
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance64x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance64x16" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance32x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance32x8" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x64", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance16x64" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance16x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance16x4" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance8x32", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance8x32" , qw/neon/;
-
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-        specialize "aom_highbd_${bd}_variance4x16" , qw/neon/;
-      }
-    }
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse8x8 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse16x16 sse2 neon/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse16x8 neon/;
-    add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse8x16 neon/;
-    add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse8x8 sse2 neon/;
-
     add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+    specialize qw/aom_highbd_comp_avg_pred neon/;
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
-    specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+    specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
 
     add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
-    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2/;
+    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2 neon/;
   }
-    #
-    # Subpixel Variance
-    #
-    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    }
-
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-    specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+    specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
   }
 
   # Flow estimation library
@@ -2087,7 +1790,7 @@
     specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
 
     add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
-    specialize qw/aom_compute_flow_at_point sse4_1/;
+    specialize qw/aom_compute_flow_at_point sse4_1 neon/;
   }
 
 }  # CONFIG_AV1_ENCODER
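
Note on the RTCD changes above: each `specialize` line maps the listed ISAs onto suffixed prototypes, and the generated aom_dsp_rtcd.h resolves the plain name to the best implementation the build supports, falling back to the C version. A minimal sketch of the effect for the newly NEON-specialized aom_compute_flow_at_point, assuming the usual _c/_sse4_1/_neon suffix convention and simplifying the generated dispatch to compile-time #if selection (the real generated header may instead use runtime CPU-flag dispatch on x86):

/* Sketch of generated-header content, not the actual aom_dsp_rtcd.h. */
#include <stdint.h>
#include "config/aom_config.h" /* provides HAVE_NEON, HAVE_SSE4_1, ... */

void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref,
                                 int x, int y, int width, int height,
                                 int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref,
                                      int x, int y, int width, int height,
                                      int stride, double *u, double *v);
void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref,
                                    int x, int y, int width, int height,
                                    int stride, double *u, double *v);

#if HAVE_NEON
#define aom_compute_flow_at_point aom_compute_flow_at_point_neon
#elif HAVE_SSE4_1
#define aom_compute_flow_at_point aom_compute_flow_at_point_sse4_1
#else
#define aom_compute_flow_at_point aom_compute_flow_at_point_c
#endif
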
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h
index ab950ca..69da8f2 100644
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -24,12 +24,10 @@
 
 #define SIMD_CHECK 1  // Sanity checks in C equivalents
 
-#if HAVE_NEON
-#include "simd/v256_intrinsics_arm.h"
 // VS compiling for 32 bit targets does not support vector types in
 // structs as arguments, which makes the v256 type of the intrinsics
 // hard to support, so optimizations for this target are disabled.
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
 #include "simd/v256_intrinsics_x86.h"
 #else
 #include "simd/v256_intrinsics.h"
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 3d07a0f..c8ee780 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -24,826 +24,6 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 
-#if AOM_ARCH_AARCH64 && \
-    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
-  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
-  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
-  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
-  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
-  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
-  /* Shift left and insert new last column in transposed 4x4 block. */
-  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
-  /* Shift left and insert two new columns in transposed 4x4 block. */
-  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
-  /* Shift left and insert three new columns in transposed 4x4 block. */
-  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
-};
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
-                                          const int8x8_t filter,
-                                          const uint8x16x2_t permute_tbl) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
-                                          const int8x8_t filter,
-                                          const uint8x16x3_t permute_tbl) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
-  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
-  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
-  uint8x16_t s0, s1, s2, s3;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)x_step_q4;
-  (void)filter_y;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1);
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
-
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
-      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
-      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
-      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
-                                        uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b = vqtbl2q_u8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
-                                        uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b0, uint8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
-                                                  const uint8x16_t samples_hi,
-                                                  const int8x8_t filter) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum;
-
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
-  sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
-                                                  const uint8x16_t samples0_hi,
-                                                  const uint8x16_t samples1_lo,
-                                                  const uint8x16_t samples1_hi,
-                                                  const int8x8_t filter) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
-  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
-  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint8x16x2_t samples_LUT;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src += 7 * src_stride;
-
-    s7 = vdup_n_u8(0);
-    s8 = vdup_n_u8(0);
-    s9 = vdup_n_u8(0);
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
-    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
-    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
-    do {
-      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
-      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
-      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
-      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
-      s0123 = s4567;
-      s1234 = s5678;
-      s2345 = s6789;
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      s7 = vdup_n_u8(0);
-      s8 = vdup_n_u8(0);
-      s9 = vdup_n_u8(0);
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
-                           tran_concat_tbl);
-
-      do {
-        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                       filter);
-        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                       filter);
-        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                       filter);
-        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                       filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
-        s0123_lo = s4567_lo;
-        s0123_hi = s4567_hi;
-        s1234_lo = s5678_lo;
-        s1234_hi = s5678_hi;
-        s2345_lo = s6789_lo;
-        s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-#else  // !defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
-                                         const int8x8_t filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
-                                         const int8x8_t filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
-  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
-  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  uint8x16_t s0, s1, s2, s3;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)x_step_q4;
-  (void)filter_y;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1);
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
-
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
-                                        int8x8_t a3, int8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b = vqtbl2q_s8(samples, permute_tbl);
-}
-
-static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
-                                        int8x8_t a3, int8x16_t *b0,
-                                        int8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
-                                                 const int8x16_t samples_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filter) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
-  sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
-                                                 const int8x16_t samples0_hi,
-                                                 const int8x16_t samples1_lo,
-                                                 const int8x16_t samples1_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filter) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
-  sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
-  sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x8_t range_limit = vdup_n_u8(128);
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-    src += 7 * src_stride;
-
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-    s7 = vdup_n_s8(0);
-    s8 = vdup_n_s8(0);
-    s9 = vdup_n_s8(0);
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
-    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
-    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
-
-    do {
-      uint8x8_t t7, t8, t9, t10;
-
-      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
-      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
-      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
-      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
-      s0123 = s4567;
-      s1234 = s5678;
-      s2345 = s6789;
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-      s += 7 * src_stride;
-
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-      s7 = vdup_n_s8(0);
-      s8 = vdup_n_s8(0);
-      s9 = vdup_n_s8(0);
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
-                           tran_concat_tbl);
-
-      do {
-        uint8x8_t t7, t8, t9, t10;
-
-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                      correction, filter);
-        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                      correction, filter);
-        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                      correction, filter);
-        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                      correction, filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
-        s0123_lo = s4567_lo;
-        s0123_hi = s4567_hi;
-        s1234_lo = s5678_lo;
-        s1234_hi = s5678_hi;
-        s2345_lo = s6789_lo;
-        s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-#endif  // defined(__ARM_FEATURE_MATMUL_INT8)
-
-#else  // !(AOM_ARCH_AARCH64 &&
-       //   (defined(__ARM_FEATURE_DOTPROD) ||
-       //    defined(__ARM_FEATURE_MATMUL_INT8)))
-
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
@@ -905,7 +85,7 @@
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -918,7 +98,7 @@
 
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
@@ -931,7 +111,7 @@
       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      transpose_u8_4x4(&d01, &d23);
+      transpose_elems_inplace_u8_4x4(&d01, &d23);
 
       store_u8_4x1(dst + 0 * dst_stride, d01, 0);
       store_u8_4x1(dst + 1 * dst_stride, d23, 0);
@@ -956,7 +136,7 @@
     if (w == 4) {
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -967,7 +147,8 @@
 
         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                     &t7);
-        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+        transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+                               &t3);
         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -978,7 +159,7 @@
         d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
         d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
 
-        transpose_u8_8x4(&d0, &d1, &d2, &d3);
+        transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
 
         store_u8_4x1(dst + 0 * dst_stride, d0, 0);
         store_u8_4x1(dst + 1 * dst_stride, d1, 0);
@@ -1002,7 +183,7 @@
 
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1017,7 +198,8 @@
 
         do {
           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                                         &t7);
           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
@@ -1036,7 +218,8 @@
           d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
           d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
 
-          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+          transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                                         &d7);
 
           store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
@@ -1172,5 +355,3 @@
     } while (w != 0);
   }
 }
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
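
Note on the convolve8 kernels above and in the new neon_dotprod file below: they all compute the same thing, an 8-tap FIR along one axis whose result is rounded by FILTER_BITS and saturated to 8 bits, which is what vqrshrun_n_s16(sum, FILTER_BITS) does per lane. A scalar sketch for intuition only (the helper name is hypothetical; FILTER_BITS comes from aom_dsp/aom_filter.h, where it is 7):

#include <stdint.h>
#include "aom_dsp/aom_filter.h" /* FILTER_BITS */

static uint8_t convolve8_point(const uint8_t *src, int stride,
                               const int16_t *filter) {
  /* src points at the first of the 8 taps; stride is 1 for the horizontal
   * pass and the source row stride for the vertical pass. */
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += src[k * stride] * filter[k];
  /* Round to nearest, shift down by FILTER_BITS, clamp to [0, 255]. */
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

The dot-product variants reach the same result with a twist visible in their setup code: the sdot path subtracts 128 from every sample so it can use signed 8-bit dot products, and compensates by starting the accumulator at correction = 128 * (sum of the filter taps), while the usdot path operates on unsigned samples directly, so it starts from zero and needs no correction.
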
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000..e565414
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b0,
+                                        int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+                                                 const int8x16_t samples0_hi,
+                                                 const int8x16_t samples1_lo,
+                                                 const int8x16_t samples1_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
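+  /* As in the horizontal case, pre-compute 128 * sum(filter) and use it as
+   * the accumulator starting value to undo the range-clamp offset. */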
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
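+    /* Rows s7-s9 have not been loaded yet; zero them so the transposes below
+     * are fully initialized. The merge step in the loop recomputes s4567,
+     * s5678 and s6789 from real data before they are used. */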
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filter);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filter);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filter);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000..d778e8a
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
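+/* Each 16-byte row gathers four overlapping 4-sample windows, i.e. the
+ * inputs consumed by one USDOT step across four adjacent output pixels. */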
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
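+/* Interleaves four rows held in a pair of q registers so that each group of
+ * four bytes contains the same column of rows 0-3 (a byte-wise transpose),
+ * as required by the vertical dot product. */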
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x2_t permute_tbl) {
+  uint8x16_t permuted_samples[2];
+  int32x4_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* Accumulate the dot product from a zero accumulator; USDOT consumes the
+   * unsigned samples directly, so no range-clamp correction is needed. */
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+  sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x3_t permute_tbl) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b0, uint8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+                                                  const uint8x16_t samples_hi,
+                                                  const int8x8_t filter) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum;
+
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+  sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+                                                  const uint8x16_t samples0_hi,
+                                                  const uint8x16_t samples1_lo,
+                                                  const uint8x16_t samples1_hi,
+                                                  const int8x8_t filter) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
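+    /* s7-s9 are zeroed placeholders; the merge step in the loop below
+     * replaces s4567, s5678 and s6789 with real data before they are used. */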
+    s7 = vdup_n_u8(0);
+    s8 = vdup_n_u8(0);
+    s9 = vdup_n_u8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      s7 = vdup_n_u8(0);
+      s8 = vdup_n_u8(0);
+      s9 = vdup_n_u8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filter);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filter);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filter);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
index 583d832..d746f9e 100644
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -50,3 +50,104 @@
     }
   }
 }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
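+  // Dispatch on block width: blocks up to 16 pixels wide copy two rows per
+  // iteration, wider blocks copy one row at a time.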
+  if (w < 8) {  // copy4
+    uint16x4_t s0, s1;
+    do {
+      s0 = vld1_u16(src);
+      src += src_stride;
+      s1 = vld1_u16(src);
+      src += src_stride;
+
+      vst1_u16(dst, s0);
+      dst += dst_stride;
+      vst1_u16(dst, s1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {  // copy8
+    uint16x8_t s0, s1;
+    do {
+      s0 = vld1q_u16(src);
+      src += src_stride;
+      s1 = vld1q_u16(src);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      dst += dst_stride;
+      vst1q_u16(dst, s1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w < 32) {  // copy16
+    uint16x8_t s0, s1, s2, s3;
+    do {
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      src += src_stride;
+      s2 = vld1q_u16(src);
+      s3 = vld1q_u16(src + 8);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      dst += dst_stride;
+      vst1q_u16(dst, s2);
+      vst1q_u16(dst + 8, s3);
+      dst += dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 32) {  // copy32
+    uint16x8_t s0, s1, s2, s3;
+    do {
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {  // copy64
+    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    do {
+      const uint16_t *s = src;
+      uint16_t *d = dst;
+      int width = w;
+      do {
+        s0 = vld1q_u16(s);
+        s1 = vld1q_u16(s + 8);
+        s2 = vld1q_u16(s + 16);
+        s3 = vld1q_u16(s + 24);
+        s4 = vld1q_u16(s + 32);
+        s5 = vld1q_u16(s + 40);
+        s6 = vld1q_u16(s + 48);
+        s7 = vld1q_u16(s + 56);
+
+        vst1q_u16(d, s0);
+        vst1q_u16(d + 8, s1);
+        vst1q_u16(d + 16, s2);
+        vst1q_u16(d + 24, s3);
+        vst1q_u16(d + 32, s4);
+        vst1q_u16(d + 40, s5);
+        vst1q_u16(d + 48, s6);
+        vst1q_u16(d + 56, s7);
+        s += 64;
+        d += 64;
+        width -= 64;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index ef2f3af..2e79b2e 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <stdlib.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
@@ -19,75 +20,68 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 
-#if !AOM_ARCH_AARCH64
-static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
-  const uint32x4_t b = vpaddlq_u16(a);
-  const uint64x2_t c = vpaddlq_u32(b);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                  vreinterpret_u32_u64(vget_high_u64(c)));
-}
-#endif
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
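+  // Each load gathers two 4-byte rows into one 8-byte vector, so two loads
+  // cover the whole 4x4 block.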
+  const uint8x8_t s0 = load_unaligned_u8(p, stride);
+  const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
 
-unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
-  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
-  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if AOM_ARCH_AARCH64
-  const uint32_t d = vaddlvq_u16(c);
-  return (d + 8) >> 4;
-#else
-  const uint32x2_t d = horizontal_add_u16x8_v(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
-#endif
+  const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
+  return (sum + (1 << 3)) >> 4;
 }
 
-unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
-  uint16x8_t sum;
-  uint8x8_t b = vld1_u8(a);
-  a += a_stride;
-  uint8x8_t c = vld1_u8(a);
-  a += a_stride;
-  sum = vaddl_u8(b, c);
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+  uint8x8_t s0 = vld1_u8(p);
+  p += stride;
+  uint8x8_t s1 = vld1_u8(p);
+  p += stride;
+  uint16x8_t acc = vaddl_u8(s0, s1);
 
-  for (int i = 0; i < 6; ++i) {
-    const uint8x8_t e = vld1_u8(a);
-    a += a_stride;
-    sum = vaddw_u8(sum, e);
-  }
+  int i = 0;
+  do {
+    const uint8x8_t si = vld1_u8(p);
+    p += stride;
+    acc = vaddw_u8(acc, si);
+  } while (++i < 6);
 
-#if AOM_ARCH_AARCH64
-  const uint32_t d = vaddlvq_u16(sum);
-  return (d + 32) >> 6;
-#else
-  const uint32x2_t d = horizontal_add_u16x8_v(sum);
-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
-#endif
+  const uint32_t sum = horizontal_add_u16x8(acc);
+  return (sum + (1 << 5)) >> 6;
 }
 
 void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
                            int *avg) {
-  for (int k = 0; k < 4; k++) {
-    const int x8_idx = x16_idx + ((k & 1) << 3);
-    const int y8_idx = y16_idx + ((k >> 1) << 3);
-    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
-    avg[k] = aom_avg_8x8_neon(s_tmp, p);
-  }
+  avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+  avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+  avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+  avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
 }
 
 int aom_satd_lp_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
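+  // Peel the first 16 coefficients so the two accumulators start from
+  // pairwise-widened absolute values instead of zero.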
+  int16x8_t s0 = vld1q_s16(coeff);
+  int16x8_t s1 = vld1q_s16(coeff + 8);
 
-  do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+  int16x8_t abs0 = vabsq_s16(s0);
+  int16x8_t abs1 = vabsq_s16(s1);
+
+  int32x4_t acc0 = vpaddlq_s16(abs0);
+  int32x4_t acc1 = vpaddlq_s16(abs1);
+
+  length -= 16;
+  coeff += 16;
+
+  while (length != 0) {
+    s0 = vld1q_s16(coeff);
+    s1 = vld1q_s16(coeff + 8);
+
+    abs0 = vabsq_s16(s0);
+    abs1 = vabsq_s16(s1);
+
+    acc0 = vpadalq_s16(acc0, abs0);
+    acc1 = vpadalq_s16(acc1, abs1);
+
     length -= 16;
     coeff += 16;
-  } while (length != 0);
+  }
 
+  int32x4_t accum = vaddq_s32(acc0, acc1);
   return horizontal_add_s32x4(accum);
 }
 
@@ -180,56 +174,84 @@
   } while (h < height);
 }
 
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
 int aom_satd_neon(const tran_low_t *coeff, int length) {
   const int32x4_t zero = vdupq_n_s32(0);
-  int32x4_t accum = zero;
-  do {
-    const int32x4_t src0 = vld1q_s32(&coeff[0]);
-    const int32x4_t src8 = vld1q_s32(&coeff[4]);
-    const int32x4_t src16 = vld1q_s32(&coeff[8]);
-    const int32x4_t src24 = vld1q_s32(&coeff[12]);
-    accum = vabaq_s32(accum, src0, zero);
-    accum = vabaq_s32(accum, src8, zero);
-    accum = vabaq_s32(accum, src16, zero);
-    accum = vabaq_s32(accum, src24, zero);
+
+  int32x4_t s0 = vld1q_s32(&coeff[0]);
+  int32x4_t s1 = vld1q_s32(&coeff[4]);
+  int32x4_t s2 = vld1q_s32(&coeff[8]);
+  int32x4_t s3 = vld1q_s32(&coeff[12]);
+
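+  // Accumulate into two independent registers to keep the dependency chains
+  // short; they are summed once at the end.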
+  int32x4_t accum0 = vabsq_s32(s0);
+  int32x4_t accum1 = vabsq_s32(s2);
+  accum0 = vabaq_s32(accum0, s1, zero);
+  accum1 = vabaq_s32(accum1, s3, zero);
+
+  length -= 16;
+  coeff += 16;
+
+  while (length != 0) {
+    s0 = vld1q_s32(&coeff[0]);
+    s1 = vld1q_s32(&coeff[4]);
+    s2 = vld1q_s32(&coeff[8]);
+    s3 = vld1q_s32(&coeff[12]);
+
+    accum0 = vabaq_s32(accum0, s0, zero);
+    accum1 = vabaq_s32(accum1, s1, zero);
+    accum0 = vabaq_s32(accum0, s2, zero);
+    accum1 = vabaq_s32(accum1, s3, zero);
+
     length -= 16;
     coeff += 16;
-  } while (length != 0);
+  }
 
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return horizontal_add_s32x4(accum);
+  // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+  return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
 }
 
 int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
-  int32x4_t v_mean = vdupq_n_s32(0);
-  int32x4_t v_sse = v_mean;
-  int16x8_t v_ref, v_src;
-  int16x4_t v_low;
+  assert(bwl >= 2 && bwl <= 5);
+  int width = 4 << bwl;
 
-  int i, width = 4 << bwl;
-  for (i = 0; i < width; i += 8) {
-    v_ref = vld1q_s16(&ref[i]);
-    v_src = vld1q_s16(&src[i]);
-    const int16x8_t diff = vsubq_s16(v_ref, v_src);
-    // diff: dynamic range [-510, 510], 10 bits.
-    v_mean = vpadalq_s16(v_mean, diff);
-    v_low = vget_low_s16(diff);
-    v_sse = vmlal_s16(v_sse, v_low, v_low);
-#if AOM_ARCH_AARCH64
-    v_sse = vmlal_high_s16(v_sse, diff, diff);
-#else
-    const int16x4_t v_high = vget_high_s16(diff);
-    v_sse = vmlal_s16(v_sse, v_high, v_high);
-#endif
-  }
-  const int mean = horizontal_add_s32x4(v_mean);
-  const int sse = horizontal_add_s32x4(v_sse);
-  const unsigned int mean_abs = mean >= 0 ? mean : -mean;
-  // (mean * mean): dynamic range 31 bits.
-  const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
-  return var;
+  int16x8_t r = vld1q_s16(ref);
+  int16x8_t s = vld1q_s16(src);
+
+  // diff: dynamic range [-510, 510], 10 (signed) bits.
+  int16x8_t diff = vsubq_s16(r, s);
+  // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+  int16x8_t v_mean = diff;
+  // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+  int32x4_t v_sse[2];
+  v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+  v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+  ref += 8;
+  src += 8;
+  width -= 8;
+
+  do {
+    r = vld1q_s16(ref);
+    s = vld1q_s16(src);
+
+    diff = vsubq_s16(r, s);
+    v_mean = vaddq_s16(v_mean, diff);
+
+    v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+    v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  // Dynamic range [0, 65280], 16 (unsigned) bits.
+  const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+  const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+  // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+  return sse - ((mean_abs * mean_abs) >> (bwl + 2));
 }
 
 void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
diff --git a/aom_dsp/arm/avg_pred_neon.c b/aom_dsp/arm/avg_pred_neon.c
index 04e0904..b17f7fc 100644
--- a/aom_dsp/arm/avg_pred_neon.c
+++ b/aom_dsp/arm/avg_pred_neon.c
@@ -13,6 +13,9 @@
 #include <assert.h>
 
 #include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/blend.h"
 
@@ -74,6 +77,75 @@
   }
 }
 
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+                                     int width, int height, const uint8_t *ref,
+                                     int ref_stride,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
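+  // Each output is a distance-weighted average of the reference and
+  // prediction samples, using the forward/backward offsets from jcp_param.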
+  if (width > 8) {
+    do {
+      const uint8_t *pred_ptr = pred;
+      const uint8_t *ref_ptr = ref;
+      uint8_t *comp_pred_ptr = comp_pred;
+      int w = width;
+
+      do {
+        const uint8x16_t p = vld1q_u8(pred_ptr);
+        const uint8x16_t r = vld1q_u8(ref_ptr);
+
+        const uint8x16_t wtd_avg =
+            dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+        vst1q_u8(comp_pred_ptr, wtd_avg);
+
+        ref_ptr += 16;
+        pred_ptr += 16;
+        comp_pred_ptr += 16;
+        w -= 16;
+      } while (w != 0);
+
+      ref += ref_stride;
+      pred += width;
+      comp_pred += width;
+    } while (--height != 0);
+  } else if (width == 8) {
+    int h = height / 2;
+
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+      const uint8x16_t wtd_avg =
+          dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+      vst1q_u8(comp_pred, wtd_avg);
+
+      ref += 2 * ref_stride;
+      pred += 16;
+      comp_pred += 16;
+    } while (--h != 0);
+  } else {
+    int h = height / 2;
+    assert(width == 4);
+
+    do {
+      const uint8x8_t p = vld1_u8(pred);
+      const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+      const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+                                                  vget_low_u8(bck_offset));
+
+      vst1_u8(comp_pred, wtd_avg);
+
+      ref += 2 * ref_stride;
+      pred += 8;
+      comp_pred += 8;
+    } while (--h != 0);
+  }
+}
+
 void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
                              int height, const uint8_t *ref, int ref_stride,
                              const uint8_t *mask, int mask_stride,
@@ -84,7 +156,6 @@
   const int src_stride1 = invert_mask ? ref_stride : width;
 
   if (width > 8) {
-    const uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
     do {
       const uint8_t *src0_ptr = src0;
       const uint8_t *src1_ptr = src1;
@@ -97,19 +168,7 @@
         const uint8x16_t s1 = vld1q_u8(src1_ptr);
         const uint8x16_t m0 = vld1q_u8(mask_ptr);
 
-        uint8x16_t m0_inv = vsubq_u8(max_alpha, m0);
-        uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(m0));
-        uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(m0));
-        blend_u16_lo =
-            vmlal_u8(blend_u16_lo, vget_low_u8(s1), vget_low_u8(m0_inv));
-        blend_u16_hi =
-            vmlal_u8(blend_u16_hi, vget_high_u8(s1), vget_high_u8(m0_inv));
-
-        uint8x8_t blend_u8_lo =
-            vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
-        uint8x8_t blend_u8_hi =
-            vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
-        uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+        uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
 
         vst1q_u8(comp_pred_ptr, blend_u8);
 
@@ -126,17 +185,12 @@
       comp_pred += width;
     } while (--height != 0);
   } else if (width == 8) {
-    const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
-
     do {
       const uint8x8_t s0 = vld1_u8(src0);
       const uint8x8_t s1 = vld1_u8(src1);
       const uint8x8_t m0 = vld1_u8(mask);
 
-      uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
-      uint16x8_t blend_u16 = vmull_u8(s0, m0);
-      blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
-      uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+      uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
 
       vst1_u8(comp_pred, blend_u8);
 
@@ -146,7 +200,6 @@
       comp_pred += 8;
     } while (--height != 0);
   } else {
-    const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
     int h = height / 2;
     assert(width == 4);
 
@@ -155,10 +208,7 @@
       const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
       const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
 
-      uint8x8_t m0_inv = vsub_u8(max_alpha, m0);
-      uint16x8_t blend_u16 = vmull_u8(s0, m0);
-      blend_u16 = vmlal_u8(blend_u16, s1, m0_inv);
-      uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+      uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
 
       vst1_u8(comp_pred, blend_u8);
 
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index c3ee0b7..7b1b66a 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -12,117 +12,34 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
-                            const int16x8_t v_maxval, int16x8_t *res) {
-  int32x4_t im_res_low, im_res_high;
-  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
 
-  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
-  im_res_low =
-      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
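+// Alpha blend two compound (d16) inputs:
+//   ((m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b) >> AOM_BLEND_A64_ROUND_BITS)
+// then subtract the compound round offset and narrow back to 8-bit pixels
+// with rounding.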
+static INLINE uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a,
+                                                  uint16x8_t b,
+                                                  uint16x8_t round_offset) {
+  const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
 
-  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
-  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
-                          vget_high_s16(src_1));
+  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
 
-  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
-                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
+  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+  blend_u32_hi =
+      vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
 
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
-                             int16x8_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1, src0_2, src0_3;
-  int16x8_t src1_0, src1_1, src1_2, src1_3;
-  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+  uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
 
-  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
-               &src0_3);
-  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
-               &src1_3);
+  uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
 
-  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
-  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
-  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
-  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+  res = vqsubq_u16(res, round_offset);
 
-  uint16x8_t im_res1_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
-  uint16x8_t im_res1_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
-  uint16x8_t im_res1_2 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
-  uint16x8_t im_res1_3 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
-  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
-  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
-  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
-  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
-  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
-  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
-  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
-  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
-
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
-                             int16x4_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1;
-  int16x8_t src1_0, src1_1;
-  uint16x8_t tu0 = vdupq_n_u16(0);
-  uint16x8_t tu1 = vdupq_n_u16(0);
-  uint16x8_t tu2 = vdupq_n_u16(0);
-  uint16x8_t tu3 = vdupq_n_u16(0);
-  int16x8_t mask0_1, mask2_3;
-  int16x8_t res0, res1;
-
-  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
-  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
-  src0_0 = vreinterpretq_s16_u16(tu0);
-  src0_1 = vreinterpretq_s16_u16(tu1);
-
-  src1_0 = vreinterpretq_s16_u16(tu2);
-  src1_1 = vreinterpretq_s16_u16(tu3);
-
-  mask0_1 = vcombine_s16(mask0, mask1);
-  mask2_3 = vcombine_s16(mask2, mask3);
-
-  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
-  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
-  uint16x8_t im_res_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
-  uint16x8_t im_res_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
-  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
-  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
-  uint8x8_t res_0 = vqmovun_s16(src0_0);
-  uint8x8_t res_1 = vqmovun_s16(src0_1);
-
-  store_unaligned_u8_4x1(dst + 0 * dst_stride, res_0, 0);
-  store_unaligned_u8_4x1(dst + 1 * dst_stride, res_0, 1);
-  store_unaligned_u8_4x1(dst + 2 * dst_stride, res_1, 0);
-  store_unaligned_u8_4x1(dst + 3 * dst_stride, res_1, 1);
+  return vqrshrn_n_u16(res,
+                       2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
 }
 
 void aom_lowbd_blend_a64_d16_mask_neon(
@@ -130,19 +47,13 @@
     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
     ConvolveParams *conv_params) {
-  int i = 0;
-  const int bd = 8;
-  int w_tmp = w;
-  const uint8_t *mask_tmp = mask;
-  const CONV_BUF_TYPE *src0_tmp = src0;
-  const CONV_BUF_TYPE *src1_tmp = src1;
-  uint8_t *dst_tmp = dst;
+  (void)conv_params;
 
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
 
   assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
@@ -152,294 +63,430 @@
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  uint8x8_t s0 = vdup_n_u8(0);
-  uint8x8_t s1 = vdup_n_u8(0);
-  uint8x8_t s2 = vdup_n_u8(0);
-  uint8x8_t s3 = vdup_n_u8(0);
-  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
-  int16x8_t mask0, mask1, mask2, mask3;
-  int16x8_t mask4, mask5, mask6, mask7;
-  int32x4_t m0_32, m1_32, m2_32, m3_32;
-  int32x4_t m4_32, m5_32, m6_32, m7_32;
-  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
-  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
-  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
-  const uint16x4_t vec_zero = vdup_n_u16(0);
-  const uint16_t offset = round_offset - (1 << (round_bits - 1));
-  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
-  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
-  const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
   if (subw == 0 && subh == 0) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
-          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
-          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
-          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+          uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, mask_stride, &s0, &s1);
+        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vmovl_u8(s0));
-        mask1 = vreinterpretq_s16_u16(vmovl_u8(s1));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
 
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else if (subw == 1 && subh == 1) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
-                       &t7);
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
-          mask1 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
-          mask2 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
-          mask3 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+          uint16x8_t m_avg =
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
 
-          mask4 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
-          mask5 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
-          mask6 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
-          mask7 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          m0_32 = vpaddlq_s16(mask0);
-          m1_32 = vpaddlq_s16(mask1);
-          m2_32 = vpaddlq_s16(mask2);
-          m3_32 = vpaddlq_s16(mask3);
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          m4_32 = vpaddlq_s16(mask4);
-          m5_32 = vpaddlq_s16(mask5);
-          m6_32 = vpaddlq_s16(mask6);
-          m7_32 = vpaddlq_s16(mask7);
-
-          mask0 =
-              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
-          mask1 =
-              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
-          mask2 =
-              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
-          mask3 =
-              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        m0_32 = vpaddlq_s16(mask0);
-        m1_32 = vpaddlq_s16(mask1);
-        m2_32 = vpaddlq_s16(mask2);
-        m3_32 = vpaddlq_s16(mask3);
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        mask0_low = vqrshrn_n_s32(m0_32, 2);
-        mask1_low = vqrshrn_n_s32(m1_32, 2);
-        mask2_low = vqrshrn_n_s32(m2_32, 2);
-        mask3_low = vqrshrn_n_s32(m3_32, 2);
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else if (subw == 1 && subh == 0) {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+          uint8x8_t m0 = vld1_u8(mask + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
-          mask1 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
-          mask2 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
-          mask3 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l);
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
-        mask1 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
-        mask2 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
-        mask3 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
-        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
-        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
-        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   } else {
-    if (w_tmp > 7) {
+    if (w >= 8) {
       do {
-        w_tmp = w;
+        int i = 0;
         do {
-          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
 
-          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+          uint8x8_t blend =
+              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+          vst1_u8(dst + i, blend);
+          i += 8;
+        } while (i < w);
 
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
     } else {
       do {
-        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &s0, &s1);
-        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &s2,
-                              &s3);
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
 
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
 
-        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
 
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  }
+}
 
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+                             const uint8_t *src0, uint32_t src0_stride,
+                             const uint8_t *src1, uint32_t src1_stride,
+                             const uint8_t *mask, uint32_t mask_stride, int w,
+                             int h, int subw, int subh) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
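+  // subw/subh indicate that the mask is supplied at twice the horizontal
+  // and/or vertical resolution of the blend; in those cases the mask is
+  // downsampled by (rounded) averaging before being applied.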
+  if ((subw | subh) == 0) {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + i);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+          i += 16;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if ((subw & subh) == 1) {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+          uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+        uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if (subw == 1 && subh == 0) {
+    if (w > 8) {
+      do {
+        int i = 0;
+
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+          uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask);
+        uint8x8_t m1 = vld1_u8(mask + 8);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else {
+    if (w > 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+          uint8x16_t s0 = vld1q_u8(src0 + i);
+          uint8x16_t s1 = vld1q_u8(src1 + i);
+
+          uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+          vst1q_u8(dst + i, blend);
+
+          i += 16;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t s0 = vld1_u8(src0);
+        uint8x8_t s1 = vld1_u8(src1);
+
+        uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        vst1_u8(dst, blend);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+        uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+        store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
     }
   }
 }
diff --git a/aom_dsp/arm/blend_neon.h b/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000..c8a03224
--- /dev/null
+++ b/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
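+// Alpha blend two vectors of pixels with a per-element mask in [0, 64]:
+//   res = ROUND_POWER_OF_TWO(m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b,
+//                            AOM_BLEND_A64_ROUND_BITS)
+// i.e. a rounded weighted average, as in the scalar AOM_BLEND_A64 macro.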
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+                                               uint8x16_t b) {
+  const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+  uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+  blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+  blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+  uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+                                             uint8x8_t b) {
+  const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint16x8_t blend_u16 = vmull_u8(m, a);
+
+  blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+  return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+                                               uint16x8_t b) {
+  uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+  blend_u32_hi =
+      vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+  uint16x4_t blend_u16_lo =
+      vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint16x4_t blend_u16_hi =
+      vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+                                               uint16x4_t b) {
+  const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32 = vmull_u16(m, a);
+
+  blend_u32 = vmlal_u16(blend_u32, m_inv, b);
+
+  return vrshrn_n_u32(blend_u32, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
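+// The avg_blend_* helpers below downsample a mask that was supplied at twice
+// the blend resolution: avg_blend_* averages two vertically adjacent rows,
+// while avg_blend_pairwise_* averages horizontally adjacent elements.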
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+  return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+  return vrhaddq_u8(a, b);
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+  return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+  return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+  uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+  uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+  return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif  // AOM_ARCH_AARCH64
+}
+
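+// Average a 2x2 neighbourhood of mask values (both subw and subh set). Mask
+// values are at most 64, so only the all-64 case can overflow the 8-bit
+// accumulator; the saturating add clamps that sum to 255, which still rounds
+// to 64 after the shift by 2.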
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+                                                  uint8x8_t c, uint8x8_t d) {
+  uint8x8_t a_c = vpadd_u8(a, c);
+  uint8x8_t b_d = vpadd_u8(b, d);
+  return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+                                                    uint8x16_t c,
+                                                    uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+  uint8x16_t a_c = vpaddq_u8(a, c);
+  uint8x16_t b_d = vpaddq_u8(b, d);
+  return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+  uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+  uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+  uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+  uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+  uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+  uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+  return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif  // AOM_ARCH_AARCH64
+}
+
+#endif  // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/aom_dsp/arm/blk_sse_sum_neon.c b/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000..f2ada93
--- /dev/null
+++ b/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+  int i = bh;
+  int32x4_t sum = vdupq_n_s32(0);
+  int32x4_t sse = vdupq_n_s32(0);
+
+  do {
+    int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+    sum = vpadalq_s16(sum, d);
+
+    sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+    sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+    data += 2 * stride;
+    i -= 2;
+  } while (i != 0);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+  int i = bh;
+  int32x4_t sum = vdupq_n_s32(0);
+  int32x4_t sse = vdupq_n_s32(0);
+
+  // The input is 12 bits wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t and the
+  // maximum value of bh is 32, we don't have to worry about sse overflowing.
+
+  do {
+    int16x8_t d = vld1q_s16(data);
+
+    sum = vpadalq_s16(sum, d);
+
+    sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+    sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+    data += stride;
+  } while (--i != 0);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+                                              int bw, int bh, int *x_sum,
+                                              int64_t *x2_sum) {
+  int32x4_t sum = vdupq_n_s32(0);
+  int64x2_t sse = vdupq_n_s64(0);
+
+  // The input is 12 bits wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t vector,
+  // this means we can process up to (127 * 4) / bw rows before we need to
+  // widen the accumulator to 64 bits.
+
+  int i_limit = (127 * 4) / bw;
+  int i_tmp = bh > i_limit ? i_limit : bh;
+
+  int i = 0;
+  do {
+    int32x4_t sse_s32 = vdupq_n_s32(0);
+    do {
+      int j = bw;
+      const int16_t *data_ptr = data;
+      do {
+        int16x8_t d = vld1q_s16(data_ptr);
+
+        sum = vpadalq_s16(sum, d);
+
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+        data_ptr += 8;
+        j -= 8;
+      } while (j != 0);
+
+      data += stride;
+      i++;
+    } while (i < i_tmp && i < bh);
+
+    sse = vpadalq_s32(sse, sse_s32);
+    i_tmp += i_limit;
+  } while (i < bh);
+
+  *x_sum = horizontal_add_s32x4(sum);
+  *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  if (bw == 4) {
+    get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+  } else if (bw == 8) {
+    get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+  } else {
+    assert(bw % 8 == 0);
+    get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/aom_dsp/arm/dist_wtd_avg_neon.h b/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000..19c9b04
--- /dev/null
+++ b/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
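+// Distance-weighted average: res = (a * wta + b * wtb) >> DIST_PRECISION_BITS
+// with round-to-nearest. The two weights are expected to sum to
+// (1 << DIST_PRECISION_BITS) so that the result stays in the input range.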
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+                                          uint8x8_t wta, uint8x8_t wtb) {
+  uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+  wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+  return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+                                            uint16x4_t wta, uint16x4_t wtb) {
+  uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+  wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+  return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+                                            uint8x16_t wta, uint8x16_t wtb) {
+  uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+  uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+  wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+  wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+  uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+  uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+  return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+                                            uint16x8_t wta, uint16x8_t wtb) {
+  uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+  uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+  wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+  wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+  uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+  uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+  return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif  // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index a7d66b3..fb4cda7 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -48,8 +48,8 @@
     // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
     const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
     const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
 
     // fdct_round_shift
     int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -57,11 +57,13 @@
 
     // s_3 * cospi_8_64 + s_2 * cospi_24_64
     // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
 
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+    const int32x4_t temp3 =
+        vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+    const int32x4_t temp4 =
+        vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
 
     // fdct_round_shift
     int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
@@ -69,7 +71,7 @@
 
     // Only transpose the first pass
     if (i == 0) {
-      transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+      transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
     }
 
     *input_0 = out_0;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 82ce0cd..d0f5922 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -37,7 +37,7 @@
 
   hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
 
-  transpose_s16_4x4d(&a0, &a1, &a2, &a3);
+  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
 
   hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
 
@@ -91,7 +91,7 @@
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
@@ -120,7 +120,7 @@
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
   hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
 
@@ -196,56 +196,90 @@
   /* Bottom right. */
   aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
 
+  // Each iteration of the loop operates on entire rows (16 samples each)
+  // because we need to swap the second and third quarters of every row in the
+  // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+  // loop at the end of aom_hadamard_16x16_c.
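+  // The loads and stores below also operate directly on the 32-bit tran_low_t
+  // coefficients instead of converting to int16 on load and back to
+  // tran_low_t on store.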
   for (int i = 0; i < 64; i += 16) {
-    const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
-    const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
-    const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
-    const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
+    const int32x4_t a00 = vld1q_s32(coeff + 0);
+    const int32x4_t a01 = vld1q_s32(coeff + 64);
+    const int32x4_t a02 = vld1q_s32(coeff + 128);
+    const int32x4_t a03 = vld1q_s32(coeff + 192);
 
-    const int16x8_t b00 = vhaddq_s16(a00, a01);
-    const int16x8_t b01 = vhsubq_s16(a00, a01);
-    const int16x8_t b02 = vhaddq_s16(a02, a03);
-    const int16x8_t b03 = vhsubq_s16(a02, a03);
+    const int32x4_t b00 = vhaddq_s32(a00, a01);
+    const int32x4_t b01 = vhsubq_s32(a00, a01);
+    const int32x4_t b02 = vhaddq_s32(a02, a03);
+    const int32x4_t b03 = vhsubq_s32(a02, a03);
 
-    const int16x8_t c00 = vaddq_s16(b00, b02);
-    const int16x8_t c01 = vaddq_s16(b01, b03);
-    const int16x8_t c02 = vsubq_s16(b00, b02);
-    const int16x8_t c03 = vsubq_s16(b01, b03);
+    const int32x4_t c00 = vaddq_s32(b00, b02);
+    const int32x4_t c01 = vaddq_s32(b01, b03);
+    const int32x4_t c02 = vsubq_s32(b00, b02);
+    const int32x4_t c03 = vsubq_s32(b01, b03);
 
-    const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
-    const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
-    const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
-    const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
+    const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+    const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+    const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+    const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
 
-    const int16x8_t b10 = vhaddq_s16(a10, a11);
-    const int16x8_t b11 = vhsubq_s16(a10, a11);
-    const int16x8_t b12 = vhaddq_s16(a12, a13);
-    const int16x8_t b13 = vhsubq_s16(a12, a13);
+    const int32x4_t b10 = vhaddq_s32(a10, a11);
+    const int32x4_t b11 = vhsubq_s32(a10, a11);
+    const int32x4_t b12 = vhaddq_s32(a12, a13);
+    const int32x4_t b13 = vhsubq_s32(a12, a13);
 
-    const int16x8_t c10 = vaddq_s16(b10, b12);
-    const int16x8_t c11 = vaddq_s16(b11, b13);
-    const int16x8_t c12 = vsubq_s16(b10, b12);
-    const int16x8_t c13 = vsubq_s16(b11, b13);
+    const int32x4_t c10 = vaddq_s32(b10, b12);
+    const int32x4_t c11 = vaddq_s32(b11, b13);
+    const int32x4_t c12 = vsubq_s32(b10, b12);
+    const int32x4_t c13 = vsubq_s32(b11, b13);
 
-    store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
-    store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
-    store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
-    store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
+    const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+    const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+    const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+    const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
 
-    store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
-    store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
-    store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
-    store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
+    const int32x4_t b20 = vhaddq_s32(a20, a21);
+    const int32x4_t b21 = vhsubq_s32(a20, a21);
+    const int32x4_t b22 = vhaddq_s32(a22, a23);
+    const int32x4_t b23 = vhsubq_s32(a22, a23);
 
-    store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
-    store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
-    store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
-    store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
+    const int32x4_t c20 = vaddq_s32(b20, b22);
+    const int32x4_t c21 = vaddq_s32(b21, b23);
+    const int32x4_t c22 = vsubq_s32(b20, b22);
+    const int32x4_t c23 = vsubq_s32(b21, b23);
 
-    store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
-    store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
-    store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
-    store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+    const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+    const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+    const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+    const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+    const int32x4_t b30 = vhaddq_s32(a30, a31);
+    const int32x4_t b31 = vhsubq_s32(a30, a31);
+    const int32x4_t b32 = vhaddq_s32(a32, a33);
+    const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+    const int32x4_t c30 = vaddq_s32(b30, b32);
+    const int32x4_t c31 = vaddq_s32(b31, b33);
+    const int32x4_t c32 = vsubq_s32(b30, b32);
+    const int32x4_t c33 = vsubq_s32(b31, b33);
+
+    vst1q_s32(coeff + 0 + 0, c00);
+    vst1q_s32(coeff + 0 + 4, c20);
+    vst1q_s32(coeff + 0 + 8, c10);
+    vst1q_s32(coeff + 0 + 12, c30);
+
+    vst1q_s32(coeff + 64 + 0, c01);
+    vst1q_s32(coeff + 64 + 4, c21);
+    vst1q_s32(coeff + 64 + 8, c11);
+    vst1q_s32(coeff + 64 + 12, c31);
+
+    vst1q_s32(coeff + 128 + 0, c02);
+    vst1q_s32(coeff + 128 + 4, c22);
+    vst1q_s32(coeff + 128 + 8, c12);
+    vst1q_s32(coeff + 128 + 12, c32);
+
+    vst1q_s32(coeff + 192 + 0, c03);
+    vst1q_s32(coeff + 192 + 4, c23);
+    vst1q_s32(coeff + 192 + 8, c13);
+    vst1q_s32(coeff + 192 + 12, c33);
 
     coeff += 16;
   }
diff --git a/aom_dsp/arm/highbd_avg_pred_neon.c b/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000..531309b
--- /dev/null
+++ b/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  int i = height;
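+  // vrhadd(q)_u16 computes the rounded average of the two predictors per
+  // element, i.e. (p + r + 1) >> 1.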
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        uint16x8_t avg = vrhaddq_u16(p, r);
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      uint16x8_t avg = vrhaddq_u16(p, r);
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      uint16x4_t avg = vrhadd_u16(p, r);
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int src_stride0 = invert_mask ? width : ref_stride;
+  const int src_stride1 = invert_mask ? ref_stride : width;
+
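+  // The mask weights src0 by m and src1 by (64 - m); invert_mask selects
+  // whether pred or ref takes the role of src0.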
+  if (width >= 8) {
+    do {
+      int j = 0;
+
+      do {
+        const uint16x8_t s0 = vld1q_u16(src0 + j);
+        const uint16x8_t s1 = vld1q_u16(src1 + j);
+        const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+        uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        vst1q_u16(comp_pred + j, blend_u16);
+
+        j += 8;
+      } while (j < width);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += width;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+
+    do {
+      const uint16x4_t s0 = vld1_u16(src0);
+      const uint16x4_t s1 = vld1_u16(src1);
+      const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+      uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      vst1_u16(comp_pred, blend_u16);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += 4;
+    } while (--height != 0);
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        const uint16x8_t avg =
+            dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      const uint16x8_t avg =
+          dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      const uint16x4_t avg = dist_wtd_avg_u16x4(
+          r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000..bdd2177
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
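+  // The mask varies only with the column, so the same mask values are applied
+  // to every row; for the narrow block sizes the mask is loaded once outside
+  // the loop.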
+  if (w >= 8) {
+    do {
+      int i = 0;
+      do {
+        uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+        uint16x8_t s0 = vld1q_u16(src0 + i);
+        uint16x8_t s1 = vld1q_u16(src1 + i);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        vst1q_u16(dst + i, blend);
+        i += 8;
+      } while (i < w);
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+    do {
+      uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+      uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+      store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 8) {
+    const uint16x4_t m0 =
+        vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+    do {
+      uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+      uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+      uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                                 src1_stride, mask, w, h, bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000..36d763a
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
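+// The *_d16 sources hold intermediate convolve output that still carries the
+// compound round offset, so the blend folds a negative offset into the
+// accumulator and then shifts by the combined rounding amount while clamping
+// the result to the bit depth.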
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits)                               \
+  static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8(                  \
+      uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) {     \
+    const uint16x8_t m_inv =                                                  \
+        vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);                   \
+                                                                              \
+    uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset),  \
+                                        vget_low_u16(m), vget_low_u16(a));    \
+    uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset),  \
+                                        vget_high_u16(m), vget_high_u16(a));  \
+                                                                              \
+    blend_u32_lo =                                                            \
+        vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));        \
+    blend_u32_hi =                                                            \
+        vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));      \
+                                                                              \
+    uint16x4_t blend_u16_lo =                                                 \
+        vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo),                   \
+                       AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS -           \
+                           round0_bits - COMPOUND_ROUND1_BITS);               \
+    uint16x4_t blend_u16_hi =                                                 \
+        vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi),                   \
+                       AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS -           \
+                           round0_bits - COMPOUND_ROUND1_BITS);               \
+                                                                              \
+    uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi);          \
+    blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1));             \
+                                                                              \
+    return blend_u16;                                                         \
+  }                                                                           \
+                                                                              \
+  static INLINE void highbd_##bd##_blend_a64_d16_mask_neon(                   \
+      uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,          \
+      uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,  \
+      const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw,      \
+      int subh) {                                                             \
+    const int offset_bits = bd + 2 * FILTER_BITS - round0_bits;               \
+    int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +      \
+                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));   \
+    int32x4_t offset =                                                        \
+        vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS));             \
+                                                                              \
+    if ((subw | subh) == 0) {                                                 \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));                      \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset);         \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += mask_stride;                                                \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset);           \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else if ((subw & subh) == 1) {                                          \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);         \
+            uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);         \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(            \
+                vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0),           \
+                vget_high_u8(m1)));                                           \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);                     \
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);                     \
+          uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);                     \
+          uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);                     \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg =                                                  \
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));            \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 4 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else if (subw == 1 && subh == 0) {                                      \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x8_t m0 = vld1_u8(mask + 2 * i);                             \
+            uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);                         \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));     \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += mask_stride;                                                \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);                     \
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);                     \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));       \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    } else {                                                                  \
+      if (w >= 8) {                                                           \
+        do {                                                                  \
+          int i = 0;                                                          \
+          do {                                                                \
+            uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);               \
+            uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);               \
+            uint16x8_t s0 = vld1q_u16(src0 + i);                              \
+            uint16x8_t s1 = vld1q_u16(src1 + i);                              \
+                                                                              \
+            uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));              \
+            uint16x8_t blend =                                                \
+                alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);      \
+                                                                              \
+            vst1q_u16(dst + i, blend);                                        \
+            i += 8;                                                           \
+          } while (i < w);                                                    \
+                                                                              \
+          mask += 2 * mask_stride;                                            \
+          src0 += src0_stride;                                                \
+          src1 += src1_stride;                                                \
+          dst += dst_stride;                                                  \
+        } while (--h != 0);                                                   \
+      } else {                                                                \
+        do {                                                                  \
+          uint8x8_t m0_2 =                                                    \
+              load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+          uint8x8_t m1_3 =                                                    \
+              load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+          uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);          \
+          uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);          \
+                                                                              \
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));            \
+          uint16x8_t blend =                                                  \
+              alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset);        \
+                                                                              \
+          store_unaligned_u16_4x2(dst, dst_stride, blend);                    \
+                                                                              \
+          mask += 4 * mask_stride;                                            \
+          src0 += 2 * src0_stride;                                            \
+          src1 += 2 * src1_stride;                                            \
+          dst += 2 * dst_stride;                                              \
+          h -= 2;                                                             \
+        } while (h != 0);                                                     \
+      }                                                                       \
+    }                                                                         \
+  }
+
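+// The second argument is the round-0 shift used when forming the
+// CONV_BUF_TYPE intermediates; the 12-bit path uses 2 extra bits to match
+// the larger shift applied by the 12-bit compound convolve.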
+// 12-bit depth
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10-bit depth
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8-bit depth
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
+
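+// Blend two CONV_BUF_TYPE (compound prediction) buffers with a 6-bit alpha
+// mask, removing the compound rounding offset before writing bd-bit output
+// pixels.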
+void aom_highbd_blend_a64_d16_mask_neon(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  (void)conv_params;
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  if (bd == 12) {
+    highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, mask, mask_stride, w, h,
+                                      subw, subh);
+  } else if (bd == 10) {
+    highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, mask, mask_stride, w, h,
+                                      subw, subh);
+  } else {
+    highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, w, h, subw,
+                                     subh);
+  }
+}
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                    const uint8_t *src0_8, uint32_t src0_stride,
+                                    const uint8_t *src1_8, uint32_t src1_stride,
+                                    const uint8_t *mask, uint32_t mask_stride,
+                                    int w, int h, int subw, int subh, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if ((subw | subh) == 0) {
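+    // No mask subsampling: one mask value per output pixel.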
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+          i += 8;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if ((subw & subh) == 1) {
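+    // Mask is twice the block width and height: average each 2x2 group of
+    // mask values.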
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg =
+              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if (subw == 1 && subh == 0) {
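+    // Mask is twice the block width: average horizontally adjacent pairs of
+    // mask values.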
+    if (w >= 8) {
+      do {
+        int i = 0;
+
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 2 * i);
+          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 2 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else {
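+    // subw == 0 && subh == 1: mask is twice the block height, so average
+    // vertically adjacent pairs of mask rows.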
+    if (w >= 8) {
+      do {
+        int i = 0;
+        do {
+          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+          uint16x8_t s0 = vld1q_u16(src0 + i);
+          uint16x8_t s1 = vld1q_u16(src1 + i);
+
+          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+          uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+          vst1q_u16(dst + i, blend);
+
+          i += 8;
+        } while (i < w);
+
+        mask += 2 * mask_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        dst += dst_stride;
+      } while (--h != 0);
+    } else {
+      do {
+        uint8x8_t m0_2 =
+            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+        uint8x8_t m1_3 =
+            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+        uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+        store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+        mask += 4 * mask_stride;
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+    }
+  }
+}
diff --git a/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000..ea3d655
--- /dev/null
+++ b/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
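+// Vertical-mask blend: the mask supplies one alpha value per row, which is
+// applied to every pixel in that row.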
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (w >= 8) {
+    do {
+      uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+      int i = 0;
+      do {
+        uint16x8_t s0 = vld1q_u16(src0 + i);
+        uint16x8_t s1 = vld1q_u16(src1 + i);
+
+        uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+        vst1q_u16(dst + i, blend);
+        i += 8;
+      } while (i < w);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    do {
+      uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+      uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+      uint16x8_t m = vcombine_u16(m1, m2);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+      uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+      store_unaligned_u16_4x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 8) {
+    do {
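+      // Duplicate each of the two mask bytes across a pair of lanes so that
+      // each 2-pixel row uses its own alpha value.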
+      uint16x4_t m0 = vdup_n_u16(0);
+      m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+      uint8x8_t m0_zip =
+          vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+      m0 = vget_low_u16(vmovl_u8(m0_zip));
+      uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+      uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+      uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      store_unaligned_u16_2x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
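+    // Fall back to the C implementation for the remaining sizes
+    // (w == 1, or w == 2 with h < 8).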
+    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                                 src1_stride, mask, w, h, bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000..e25438c
--- /dev/null
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
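+// Apply an 8-tap filter to four pixels, accumulating into 32-bit lanes so
+// that high-bit-depth intermediates cannot overflow.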
+static INLINE int32x4_t highbd_convolve8_4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+  return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  int32x4_t sum =
+      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  const int16x8_t s2 = vextq_s16(s0, s1, 1);
+  const int16x8_t s3 = vextq_s16(s0, s1, 2);
+  const int16x8_t s4 = vextq_s16(s0, s1, 3);
+  const int16x4_t s0_lo = vget_low_s16(s0);
+  const int16x4_t s1_lo = vget_low_s16(s2);
+  const int16x4_t s2_lo = vget_low_s16(s3);
+  const int16x4_t s3_lo = vget_low_s16(s4);
+  const int16x4_t s4_lo = vget_high_s16(s0);
+  const int16x4_t s5_lo = vget_high_s16(s2);
+  const int16x4_t s6_lo = vget_high_s16(s3);
+  const int16x4_t s7_lo = vget_high_s16(s4);
+
+  return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+                                s7_lo, x_filter_0_7);
+}
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void highbd_convolve8_8_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    int32x4_t *sum0, int32x4_t *sum1) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+  *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+                                               const int16x8_t s0_hi,
+                                               const int16x8_t x_filter_0_7,
+                                               int32x4_t *sum0,
+                                               int32x4_t *sum1) {
+  const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+  const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+  const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+  const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+  const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+  const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+  const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+                         sum1);
+}
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  int32x4_t sum0, sum1;
+  highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+  int32x4_t sum0;
+  int32x4_t sum1;
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+                         &sum1);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
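+// 8-tap horizontal convolution, only used for x_step_q4 == 16 (one source
+// pixel per output pixel). Results are rounded by FILTER_BITS and clamped
+// to the bit-depth maximum.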
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+                                       ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                       ptrdiff_t dst_stride,
+                                       const int16_t *x_filter_ptr,
+                                       int x_step_q4, int w, int h, int bd) {
+  assert(w >= 4 && h >= 4);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0, s1, s2, s3;
+      load_s16_8x2(s, src_stride, &s0, &s2);
+      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+      uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+      uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+      uint16x8_t d01 = vcombine_u16(d0, d1);
+      d01 = vminq_u16(d01, max);
+
+      vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+      vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+      s += 2 * src_stride;
+      d += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+      int x_q4 = 0;
+
+      const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+      int16x8_t s0, s2, s4, s6;
+      load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+      src_x += 8;
+
+      do {
+        int16x8_t s1, s3, s5, s7;
+        load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+        uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+        uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+        uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+        uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s1;
+        s2 = s3;
+        s4 = s5;
+        s6 = s7;
+        src_x += 8;
+        d += 8;
+        width -= 8;
+        x_q4 += 8 * x_step_q4;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+  } else {
+    (void)filter_y;
+    (void)y_step_q4;
+
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
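+    // Start 3 (SUBPEL_TAPS / 2 - 1) pixels to the left so the 8-tap filter
+    // window is centred on the output pixel.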
+    src -= SUBPEL_TAPS / 2 - 1;
+    highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+                               x_step_q4, w, h, bd);
+  }
+}
+
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+                                      ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      const int16_t *y_filter_ptr, int w, int h,
+                                      int bd) {
+  assert(w >= 4 && h >= 4);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      uint16x4_t d1 =
+          highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      uint16x4_t d2 =
+          highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      uint16x4_t d3 =
+          highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      uint16x8_t d01 = vcombine_u16(d0, d1);
+      uint16x8_t d23 = vcombine_u16(d2, d3);
+
+      d01 = vminq_u16(d01, max);
+      d23 = vminq_u16(d23, max);
+
+      vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+      vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+      vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+      vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+                                                   s7, y_filter);
+        uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+                                                   s8, y_filter);
+        uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+                                                   s9, y_filter);
+        uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+                                                   s10, y_filter);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                    uint8_t *dst8, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                x_step_q4, filter_y, y_step_q4, w, h, bd);
+  } else {
+    (void)filter_x;
+    (void)x_step_q4;
+
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
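+    // Start 3 (SUBPEL_TAPS / 2 - 1) rows above so the 8-tap filter window
+    // is centred on the output row.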
+    src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+    highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+                              bd);
+  }
+}
diff --git a/aom_dsp/arm/highbd_hadamard_neon.c b/aom_dsp/arm/highbd_hadamard_neon.c
index aad2046..d28617c 100644
--- a/aom_dsp/arm/highbd_hadamard_neon.c
+++ b/aom_dsp/arm/highbd_hadamard_neon.c
@@ -109,7 +109,7 @@
   // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
   hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
 
-  transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
 
   // For the second pass we need to widen to 32-bit elements, so we're
   // processing 4 columns at a time.
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 63f53c3..366ca3f 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -15,6 +15,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/intrapred_common.h"
 
 // -----------------------------------------------------------------------------
@@ -191,7 +192,7 @@
     uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
     uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
     uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
-    int sum = horizontal_add_and_broadcast_long_u16x8(sum_vec)[0];      \
+    int sum = horizontal_add_u16x8(sum_vec);                            \
     int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
     highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
   }
diff --git a/aom_dsp/arm/highbd_loopfilter_neon.c b/aom_dsp/arm/highbd_loopfilter_neon.c
index 2b5128e..77727b7 100644
--- a/aom_dsp/arm/highbd_loopfilter_neon.c
+++ b/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -298,7 +298,7 @@
 
   uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
                         vld1_u16(dst_q1) };
-  transpose_u16_4x4(src);
+  transpose_array_inplace_u16_4x4(src);
 
   // Adjust thresholds to bitdepth.
   const int outer_thresh = *blimit << (bd - 8);
@@ -344,7 +344,7 @@
     vget_high_u16(p0q0_output),
     vget_high_u16(p1q1_output),
   };
-  transpose_u16_4x4(output);
+  transpose_array_inplace_u16_4x4(output);
 
   vst1_u16(dst_p1, output[0]);
   vst1_u16(dst_p0, output[1]);
@@ -386,7 +386,7 @@
   // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
   //        ^^^^^^                          ^^^^^^
   // Should dual issue with the left shift.
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
   sum = vaddq_u16(sum, outer_sum);
 
@@ -401,7 +401,7 @@
   // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
   //        ^^^^^^^^
   sum = vsubq_u16(sum, p2q2_double);
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
 
   *p0q0_output = vrshrq_n_u16(sum, 3);
@@ -505,7 +505,7 @@
   // and src_raw[3] after transpose.
   uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
                             vld1q_u16(dst_2), vld1q_u16(dst_3) };
-  transpose_u16_4x8q(src_raw);
+  transpose_array_inplace_u16_4x8(src_raw);
   // p2, p1, p0, q0, q1, q2
   const uint16x4_t src[6] = {
     vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
@@ -574,7 +574,7 @@
     vget_high_u16(p0q0_output),
     vget_high_u16(p1q1_output),
   };
-  transpose_u16_4x4(output);
+  transpose_array_inplace_u16_4x4(output);
 
   // dst_n starts at p2, so adjust to p1.
   vst1_u16(dst_0 + 1, output[0]);
@@ -626,7 +626,7 @@
 
   // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
   //                                               ^^^^^^
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   sum = vaddq_u16(sum, q0p0);
 
   *p2q2_output = vrshrq_n_u16(sum, 3);
@@ -635,7 +635,7 @@
   // p1 = p2 - p3 - p2 + p1 + q1
   // q1 = q2 - q3 - q2 + q0 + p1
   sum = vsubq_u16(sum, p23q23);
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
 
   *p1q1_output = vrshrq_n_u16(sum, 3);
@@ -644,7 +644,7 @@
   // p0 = p1 - p3 - p1 + p0 + q2
   // q0 = q1 - q3 - q1 + q0 + p2
   sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
-  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
   sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
 
   *p0q0_output = vrshrq_n_u16(sum, 3);
@@ -827,7 +827,7 @@
   uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
   // After transpose, |output| will contain rows of the form:
   // p0 p1 p2 p3 q0 q1 q2 q3
-  transpose_u16_4x8q(output);
+  transpose_array_inplace_u16_4x8(output);
 
   // Reverse p values to produce original order:
   // p3 p2 p1 p0 q0 q1 q2 q3
@@ -883,7 +883,7 @@
   //                                                           ^^
   // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
   //      ^^
-  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
   sum = vaddq_u16(sum, q0p0);
 
   *p5q5_output = vrshrq_n_u16(sum, 4);
@@ -892,7 +892,7 @@
   // p4 = p5 - (2 * p6) + p3 + q1
   // q4 = q5 - (2 * q6) + q3 + p1
   sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
-  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
   sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
 
   *p4q4_output = vrshrq_n_u16(sum, 4);
@@ -901,7 +901,7 @@
   // p3 = p4 - p6 - p5 + p2 + q2
   // q3 = q4 - q6 - q5 + q2 + p2
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
-  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
   sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
 
   *p3q3_output = vrshrq_n_u16(sum, 4);
@@ -910,7 +910,7 @@
   // p2 = p3 - p6 - p4 + p1 + q3
   // q2 = q3 - q6 - q4 + q1 + p3
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
-  const uint16x8_t q3p3 = transpose64_u16q(p3q3);
+  const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
   sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
 
   *p2q2_output = vrshrq_n_u16(sum, 4);
@@ -919,7 +919,7 @@
   // p1 = p2 - p6 - p3 + p0 + q4
   // q1 = q2 - q6 - q3 + q0 + p4
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
-  const uint16x8_t q4p4 = transpose64_u16q(p4q4);
+  const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
   sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
 
   *p1q1_output = vrshrq_n_u16(sum, 4);
@@ -928,7 +928,7 @@
   // p0 = p1 - p6 - p2 + q0 + q5
   // q0 = q1 - q6 - q2 + p0 + p5
   sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
-  const uint16x8_t q5p5 = transpose64_u16q(p5q5);
+  const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
   sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
 
   *p0q0_output = vrshrq_n_u16(sum, 4);
@@ -1118,14 +1118,14 @@
   uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
                           vld1q_u16(dst_3) };
   // p7 will be the low half of src_p[0]. Not used until the end.
-  transpose_u16_4x8q(src_p);
+  transpose_array_inplace_u16_4x8(src_p);
 
   // Low halves:  q0 q1 q2 q3
   // High halves: q4 q5 q6 q7
   uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
                           vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
   // q7 will be the high half of src_q[3]. Not used until the end.
-  transpose_u16_4x8q(src_q);
+  transpose_array_inplace_u16_4x8(src_q);
 
   // Adjust thresholds to bitdepth.
   const int outer_thresh = *blimit << (bd - 8);
@@ -1238,10 +1238,10 @@
   const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
   uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
                              p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
-  transpose_u16_4x8q(output_p);
+  transpose_array_inplace_u16_4x8(output_p);
   uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
                              p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
-  transpose_u16_4x8q(output_q);
+  transpose_array_inplace_u16_4x8(output_q);
 
   // Reverse p values to produce original order:
   // p3 p2 p1 p0 q0 q1 q2 q3
diff --git a/aom_dsp/arm/highbd_masked_sad_neon.c b/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000..9262d81
--- /dev/null
+++ b/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
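+// Accumulate the SAD of 8 pixels against the masked blend of two
+// predictors, where blend = (m * a + (64 - m) * b) >> 6 with rounding.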
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+                                             const uint16_t *src,
+                                             const uint16_t *a,
+                                             const uint16_t *b,
+                                             const uint8_t *m) {
+  const uint16x8_t s0 = vld1q_u16(src);
+  const uint16x8_t a0 = vld1q_u16(a);
+  const uint16x8_t b0 = vld1q_u16(b);
+  const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
+  uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+  return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  sad = masked_sad_8x1_neon(sad, src, a, b, m);
+  return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  sad = masked_sad_16x1_neon(sad, src, a, b, m);
+  return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
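+// The *_large_neon variants below periodically widen their 16-bit SAD
+// accumulators into 32-bit lanes: at most 16 absolute differences (each at
+// most 4095 for 12-bit input) are gathered per 16-bit lane between
+// widenings, so the accumulators cannot overflow.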
+static INLINE unsigned int masked_sad_128xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+
+  do {
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                         vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+      sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+      sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+    sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+    height -= 4;
+  } while (height != 0);
+
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+  sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+  return horizontal_add_u32x4(sad_u32[0]);
+}
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad = vdupq_n_u16(0);
+    for (int h = 0; h < 4; ++h) {
+      sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
+static INLINE unsigned int masked_sad_16xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 8; ++h) {
+      sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 8;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 16; ++h) {
+      sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 16;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 128 elements in the
+  // uint16x8_t type sad accumulator, so we can only process up to 8 rows
+  // before we have to accumulate into 32-bit elements.
+  assert(height <= 8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t sad = vdupq_n_u16(0);
+
+  do {
+    sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 128 elements in the
+  // uint16x8_t type sad accumulator, so we can only process up to 16 rows
+  // before we have to accumulate into 32-bit elements.
+  assert(height <= 16);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t sad = vdupq_n_u16(0);
+
+  do {
+    sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // For 12-bit data, we can only accumulate up to 64 elements in the
+  // uint16x4_t type sad accumulator, so we can only process up to 16 rows
+  // before we have to accumulate into 32-bit elements.
+  assert(height <= 16);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  uint16x4_t sad = vdup_n_u16(0);
+  do {
+    uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+    uint16x4_t a0 = load_unaligned_u16_4x1(a);
+    uint16x4_t b0 = load_unaligned_u16_4x1(b);
+    uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+    uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+    sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u16x4(sad);
+}
+
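+// When invert_mask is set the two predictors are swapped, which is
+// equivalent to blending with (64 - mask).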
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride,  \
+                                           second_pred, w, msk, msk_stride,   \
+                                           h);                                \
+    else                                                                      \
+      return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w,   \
+                                           ref, ref_stride, msk, msk_stride,  \
+                                           h);                                \
+  }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride,  \
+                                           second_pred, w, msk, msk_stride,   \
+                                           h);                                \
+    else                                                                      \
+      return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w,   \
+                                           ref, ref_stride, msk, msk_stride,  \
+                                           h);                                \
+  }
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_sad_neon.c b/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000..28699e6
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
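+// Accumulate ROUND_POWER_OF_TWO(|wsrc[i] - mask[i] * ref[i]|, 12) for 8
+// consecutive elements into *sum; vrsraq_n_u32 performs the rounding shift
+// and the accumulation in a single instruction.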
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+                                                const int32_t *mask,
+                                                const int32_t *wsrc,
+                                                uint32x4_t *sum) {
+  int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+  int32x4_t wsrc_lo = vld1q_s32(wsrc);
+  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+  int32x4_t mask_lo = vld1q_s32(mask);
+  int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+  int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+  int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+  int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+  *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+  *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int h = height / 2;
+  do {
+    uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+    ref_ptr += 2 * ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--h != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  do {
+    uint16x8_t r = vld1q_u16(ref_ptr);
+
+    highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+    ref_ptr += ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--height != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int width, int height) {
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    int i = 0;
+    do {
+      uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+      highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+      uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+      highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+      wsrc += 16;
+      mask += 16;
+      i += 16;
+    } while (i < width);
+
+    ref_ptr += ref_stride;
+  } while (--height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int height) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+  do {
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+    uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+    uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+    highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+    highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+    highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+    highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+    wsrc += 32;
+    mask += 32;
+    ref_ptr += ref_stride;
+  } while (--height != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int h) {
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h)                                   \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_neon(                      \
+      const uint8_t *ref, int ref_stride, const int32_t *wsrc,           \
+      const int32_t *mask) {                                             \
+    return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+  }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_obmc_variance_neon.c b/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000..d592246
--- /dev/null
+++ b/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     uint32x4_t *sse,
+                                                     int32x4_t *sum) {
+  int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+  int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+  int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+  int32x4_t mask_lo = vld1q_s32(&mask[0]);
+  int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+  int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+  int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+  int32x4_t diff_hi =
+      vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+  diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+  diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up.
+  // This difference only affects the bit patterns at the rounding breakpoints
+  // exactly, so we can add -1 to all negative numbers to move the breakpoint
+  // one value across and into the correct rounding region.
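+  // For example, ROUND_POWER_OF_TWO_SIGNED(-2048, 12) is -1, whereas
+  // vrshrq_n_s32 alone would give 0; adding (value >> 31) first turns -2048
+  // into -2049, which then rounds to -1 as required.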
+  diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+  diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+  int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+  int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+  *sum = vaddq_s32(*sum, round_lo);
+  *sum = vaddq_s32(*sum, round_hi);
+  *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+                   vreinterpretq_u32_s32(round_lo));
+  *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+                   vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data, each unsigned 32-bit accumulator lane can hold at most 256
+// squared values (4095*4095*256 = 4292870400) before we have to accumulate
+// into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64, 64x128,
+// 128x64, 128x128 are processed in a different helper function.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+    int64_t *sum) {
+  uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
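+  // Each 32-bit lane accumulates (w / 8) squared values per row, so h_limit
+  // is 2048 / w (16, 32 and 64 for the 128-, 64- and 32-wide helpers below),
+  // keeping the per-lane total at the 256-value bound described above.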
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  do {
+    uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+    int j = 0;
+
+    do {
+      int i = 0;
+
+      do {
+        uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+        highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+                                          &sum_s32);
+
+        uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+        highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+                                          &sum_s32);
+
+        i += 16;
+        wsrc += 16;
+        mask += 16;
+      } while (i < width);
+
+      pre_ptr += pre_stride;
+      j++;
+    } while (j < h_tmp);
+
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+    h -= h_tmp;
+  } while (h != 0);
+
+  *sse = horizontal_add_u64x2(sse_u64);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+                                   sum);
+}
+
+static INLINE void highbd_obmc_variance_large_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+  uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+      highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+      uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+      highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+                                        &sum_s32);
+
+      i += 16;
+      wsrc += 16;
+      mask += 16;
+    } while (i < width);
+
+    pre_ptr += pre_stride;
+  } while (--h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+                                  sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    uint16x8_t pre_u16 = vld1q_u16(pre);
+
+    highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+    pre += pre_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  assert(h % 2 == 0);
+  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+
+  do {
+    uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+    highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+    pre += 2 * pre_stride;
+    wsrc += 8;
+    mask += 8;
+    h -= 2;
+  } while (h != 0);
+
+  *sse = horizontal_long_add_u32x4(sse_u32);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                               int *sum, unsigned int *sse) {
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth)                         \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
+      const int32_t *mask, unsigned int *sse) {                               \
+    int sum;                                                                  \
+    int64_t sum64;                                                            \
+    uint64_t sse64;                                                           \
+    highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+                                      &sum64);                                \
+    highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse);          \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));             \
+  }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth)                 \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(        \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
+      const int32_t *mask, unsigned int *sse) {                              \
+    int sum;                                                                 \
+    int64_t sum64;                                                           \
+    uint64_t sse64;                                                          \
+    highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+                                             &sse64, &sum64);                \
+    highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse);         \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));            \
+  }
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/aom_dsp/arm/highbd_quantize_neon.c b/aom_dsp/arm/highbd_quantize_neon.c
index 77a7aac..6149c9f 100644
--- a/aom_dsp/arm/highbd_quantize_neon.c
+++ b/aom_dsp/arm/highbd_quantize_neon.c
@@ -11,14 +11,11 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <string.h>
 
 #include "config/aom_config.h"
 
 #include "aom_dsp/quantize.h"
-#include "aom_dsp/arm/mem_neon.h"
-
-#include "av1/common/quant_common.h"
-#include "av1/encoder/av1_quantize.h"
 
 static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
 #if AOM_ARCH_AARCH64
@@ -83,6 +80,7 @@
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void get_min_max_lane_eob(const int16_t *iscan,
                                         int16x8_t *v_eobmin,
                                         int16x8_t *v_eobmax, uint16x8_t v_mask,
@@ -91,13 +89,14 @@
   const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
 #if SKIP_EOB_FACTOR_ADJUST
   const int16x8_t v_nz_iscan_min =
-      vbslq_s16(v_mask, v_iscan, vdupq_n_s16(n_coeffs));
+      vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
   *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
 #else
   (void)v_eobmin;
 #endif
   *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
@@ -117,6 +116,7 @@
 #endif
 }
 
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
 static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
 #if AOM_ARCH_AARCH64
   return (uint16_t)vminvq_s16(v_eobmin);
@@ -134,6 +134,7 @@
   return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
 #endif
 }
+#endif  // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
 
 static void highbd_quantize_b_neon(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
@@ -298,7 +299,7 @@
   int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
   uint16x4_t v_mask_lo, v_mask_hi;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
-  int16x8_t v_eobmin = vdupq_n_s16(n_coeffs);
+  int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
 
   assert(n_coeffs > 8);
   // Pre-scan pass
diff --git a/aom_dsp/arm/highbd_sad4d_neon.c b/aom_dsp/arm/highbd_sad4d_neon.c
deleted file mode 100644
index f2fda36..0000000
--- a/aom_dsp/arm/highbd_sad4d_neon.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/sum_neon.h"
-
-static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int i = 0;
-  do {
-    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
-    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
-    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
-    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
-    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
-
-    sum[0] = vabal_u16(sum[0], s, r0);
-    sum[1] = vabal_u16(sum[1], s, r1);
-    sum[2] = vabal_u16(sum[2], s, r2);
-    sum[3] = vabal_u16(sum[3], s, r3);
-
-  } while (++i < h);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
-                        vdupq_n_u16(0) };
-  uint32x4_t sum_u32[4];
-
-  int i = 0;
-  do {
-    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
-
-    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
-    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
-    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
-    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
-
-  } while (++i < h);
-
-  sum_u32[0] = vpaddlq_u16(sum[0]);
-  sum_u32[1] = vpaddlq_u16(sum[1]);
-  sum_u32[2] = vpaddlq_u16(sum[2]);
-  sum_u32[3] = vpaddlq_u16(sum[3]);
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
-}
-
-static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
-                             uint32x4_t *const sad_sum) {
-  uint16x8_t abs_diff = vabdq_u16(src, ref);
-  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
-}
-
-static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int i = 0;
-  do {
-    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
-    sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
-    sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
-    sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
-    sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
-
-  } while (++i < h);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum[4];
-
-  int i = 0;
-  do {
-    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
-    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
-    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
-
-    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
-    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
-    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
-
-  } while (++i < h);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
-                                               int src_stride,
-                                               const uint8_t *const ref_ptr[4],
-                                               int ref_stride, uint32_t res[4],
-                                               int w, int h) {
-  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
-  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
-  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
-  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
-  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
-
-  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum[4];
-
-  int i = 0;
-  do {
-    int j = 0;
-    do {
-      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
-      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
-      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
-
-      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
-      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
-      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
-
-      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
-      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
-                &sum_lo[0]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
-                &sum_lo[1]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
-                &sum_lo[2]);
-      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
-                &sum_lo[3]);
-
-      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
-      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
-                &sum_hi[0]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
-                &sum_hi[1]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
-                &sum_hi[2]);
-      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
-                &sum_hi[3]);
-
-      j += 32;
-    } while (j < w);
-
-  } while (++i < h);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void highbd_sad128xhx4d_large_neon(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
-    int ref_stride, uint32_t res[4], int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
-                              128, h);
-}
-
-static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
-                              h);
-}
-
-static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
-                                                int src_stride,
-                                                const uint8_t *const ref_ptr[4],
-                                                int ref_stride, uint32_t res[4],
-                                                int h) {
-  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
-                              h);
-}
-
-#define HBD_SAD_WXH_4D_SMALL_NEON(w, h)                                      \
-  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride,  \
-                                    sad_array, (h));                         \
-  }
-
-#define HBD_SAD_WXH_4D_LARGE_NEON(w, h)                                      \
-  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride,  \
-                                    sad_array, (h));                         \
-  }
-
-HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
-
-HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
-#endif  // !CONFIG_REALTIME_ONLY
-
-#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h)                                 \
-  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array,          \
-                                    2 * ref_stride, sad_array, ((h) >> 1));  \
-    sad_array[0] <<= 1;                                                      \
-    sad_array[1] <<= 1;                                                      \
-    sad_array[2] <<= 1;                                                      \
-    sad_array[3] <<= 1;                                                      \
-  }
-
-#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h)                                 \
-  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
-      int ref_stride, uint32_t sad_array[4]) {                               \
-    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array,          \
-                                    2 * ref_stride, sad_array, ((h) >> 1));  \
-    sad_array[0] <<= 1;                                                      \
-    sad_array[1] <<= 1;                                                      \
-    sad_array[2] <<= 1;                                                      \
-    sad_array[3] <<= 1;                                                      \
-  }
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
-
-#if !CONFIG_REALTIME_ONLY
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
-
-HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
-
-HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
-#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index 919eb55..d51f639 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -61,6 +61,7 @@
   return horizontal_add_u16x8(sum);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
@@ -82,6 +83,7 @@
 
   return horizontal_add_u32x4(sum_u32);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
                                                  int src_stride,
@@ -283,3 +285,225 @@
 
 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
 #endif  // !CONFIG_REALTIME_ONLY
+
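+// The *_avg helpers below compute the SAD between the source block and the
+// rounded average of ref and second_pred, i.e. ROUND_POWER_OF_TWO(r + p, 1),
+// which is exactly what vrhadd(q)_u16 computes per lane.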
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    uint16x4_t p = vld1_u16(pred16_ptr);
+
+    uint16x4_t avg = vrhadd_u16(r, p);
+    sum = vabal_u16(sum, s, avg);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 4;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    uint16x8_t p = vld1q_u16(pred16_ptr);
+
+    uint16x8_t avg = vrhaddq_u16(r, p);
+    uint16x8_t diff = vabdq_u16(s, avg);
+    sum = vpadalq_u16(sum, diff);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 8;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1, p0, p1;
+    uint16x8_t avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u16(src16_ptr);
+    r0 = vld1q_u16(ref16_ptr);
+    p0 = vld1q_u16(pred16_ptr);
+    avg0 = vrhaddq_u16(r0, p0);
+    diff0 = vabdq_u16(s0, avg0);
+    sum[0] = vpadalq_u16(sum[0], diff0);
+
+    s1 = vld1q_u16(src16_ptr + 8);
+    r1 = vld1q_u16(ref16_ptr + 8);
+    p1 = vld1q_u16(pred16_ptr + 8);
+    avg1 = vrhaddq_u16(r1, p1);
+    diff1 = vabdq_u16(s1, avg1);
+    sum[1] = vpadalq_u16(sum[1], diff1);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 16;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+      uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+      s0 = vld1q_u16(src16_ptr + j);
+      r0 = vld1q_u16(ref16_ptr + j);
+      p0 = vld1q_u16(pred16_ptr + j);
+      avg0 = vrhaddq_u16(r0, p0);
+      diff0 = vabdq_u16(s0, avg0);
+      sum[0] = vpadalq_u16(sum[0], diff0);
+
+      s1 = vld1q_u16(src16_ptr + j + 8);
+      r1 = vld1q_u16(ref16_ptr + j + 8);
+      p1 = vld1q_u16(pred16_ptr + j + 8);
+      avg1 = vrhaddq_u16(r1, p1);
+      diff1 = vabdq_u16(s1, avg1);
+      sum[1] = vpadalq_u16(sum[1], diff1);
+
+      s2 = vld1q_u16(src16_ptr + j + 16);
+      r2 = vld1q_u16(ref16_ptr + j + 16);
+      p2 = vld1q_u16(pred16_ptr + j + 16);
+      avg2 = vrhaddq_u16(r2, p2);
+      diff2 = vabdq_u16(s2, avg2);
+      sum[2] = vpadalq_u16(sum[2], diff2);
+
+      s3 = vld1q_u16(src16_ptr + j + 24);
+      r3 = vld1q_u16(ref16_ptr + j + 24);
+      p3 = vld1q_u16(pred16_ptr + j + 24);
+      avg3 = vrhaddq_u16(r3, p3);
+      diff3 = vabdq_u16(s3, avg3);
+      sum[3] = vpadalq_u16(sum[3], diff3);
+
+      j += 32;
+    } while (j < w);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += w;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+                                h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+                                second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                                second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
+  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),  \
+                                      second_pred);                           \
+  }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sadxd_neon.c b/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000..85ca673
--- /dev/null
+++ b/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
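+// The x4d variants below compute the SAD of one source block against four
+// reference blocks in a single pass, allowing motion search to evaluate four
+// candidates at once.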
+static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+    sum[3] = vabal_u16(sum[3], s, r3);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+  } while (++i < h);
+
+  sum_u32[0] = vpaddlq_u16(sum[0]);
+  sum_u32[1] = vpaddlq_u16(sum[1]);
+  sum_u32[2] = vpaddlq_u16(sum[2]);
+  sum_u32[3] = vpaddlq_u16(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+                             uint32x4_t *const sad_sum) {
+  uint16x8_t abs_diff = vabdq_u16(src, ref);
+  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
+    sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
+    sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
+    sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int w, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+                &sum_lo[3]);
+
+      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+                &sum_hi[3]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad128xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              128, h);
+}
+
+static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+                              h);
+}
+
+static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+                              h);
+}
+
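+// For reference, each x4d kernel above is equivalent to the following scalar
+// sketch (illustrative only, not part of the build; src16/ref16 are the
+// CONVERT_TO_SHORTPTR'd pointers):
+//
+//   for (int k = 0; k < 4; k++) {
+//     uint32_t sad = 0;
+//     for (int y = 0; y < h; y++)
+//       for (int x = 0; x < w; x++)
+//         sad += abs(src16[y * src_stride + x] - ref16[k][y * ref_stride + x]);
+//     res[k] = sad;
+//   }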
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
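+// The 'skip' variants below estimate the SAD from every other row: the stride
+// is doubled and the height halved, then the result is doubled to approximate
+// the full-height SAD.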
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h)                                 \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array,          \
+                                    2 * ref_stride, sad_array, ((h) >> 1));  \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h)                                 \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array,          \
+                                    2 * ref_stride, sad_array, ((h) >> 1));  \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
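+// The x3d kernels below mirror the x4d kernels above, but accumulate SADs for
+// only the first three reference candidates.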
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+  res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+  res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+    uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+    uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+    uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+  } while (++i < h);
+
+  res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+  res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+  res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *const ref_ptr[4],
+                                               int ref_stride, uint32_t res[4],
+                                               int w, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum[3];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+
+      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+                              h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *const ref_ptr[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int h) {
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+                              h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h)                                      \
+  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride,  \
+                                    sad_array, (h));                         \
+  }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_sse_neon.c b/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000..184e9f9
--- /dev/null
+++ b/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
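+// Compute the sum of squared differences for 8 pixels, splitting the result
+// across two 32-bit accumulators (low and high halves of the absolute
+// difference vector). The _init variant overwrites the accumulators and is
+// used for the first row, avoiding a separate zero-initialization pass.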
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+                                            const uint16_t *ref,
+                                            uint32x4_t *sse_acc0,
+                                            uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint32x4_t *sse_acc0,
+                                       uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
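+// The first row is peeled so the accumulators are initialized by the first
+// multiply rather than zeroed up front; each accumulator pair is then shared
+// between two 8-pixel column chunks of the 128-wide row.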
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int height) {
+  uint32x4_t sse[16];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+  highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+  highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+  highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+  highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+  highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+  highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+  highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+    highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+    highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+    highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+    highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[4];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint32x4_t sse[2];
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  // Peel the first loop iteration.
+  uint16x4_t s = vld1_u16(src);
+  uint16x4_t r = vld1_u16(ref);
+
+  uint16x4_t abs_diff = vabd_u16(s, r);
+  uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    s = vld1_u16(src);
+    r = vld1_u16(ref);
+
+    abs_diff = vabd_u16(s, r);
+    sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int width, int height) {
+  // { 0, 1, 2, 3, 4, 5, 6, 7 }
+  uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+  uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
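+  // remainder_mask has all-ones lanes for the first (width & 7) elements and
+  // zeros elsewhere, so when the width is not a multiple of 8 the lanes past
+  // the end of the row contribute nothing to the sum of squared differences.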
+  uint64_t sse = 0;
+
+  do {
+    int w = width;
+    int offset = 0;
+
+    do {
+      uint16x8_t s = vld1q_u16(src + offset);
+      uint16x8_t r = vld1q_u16(ref + offset);
+
+      if (w < 8) {
+        // Mask out-of-range elements.
+        s = vandq_u16(s, remainder_mask);
+        r = vandq_u16(r, remainder_mask);
+      }
+
+      uint16x8_t abs_diff = vabdq_u16(s, r);
+      uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+      uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+      uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+      sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+      sse += horizontal_long_add_u32x4(sse_u32);
+
+      offset += 8;
+      w -= 8;
+    } while (w > 0);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  return sse;
+}
+
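+// Dispatch to a specialized kernel for the common power-of-two widths and
+// fall back to the masked generic path otherwise. Each kernel computes
+// sum((src[i] - ref[i])^2) over the block; the final reduction widens the
+// result to 64 bits.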
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+                            const uint8_t *ref8, int ref_stride, int width,
+                            int height) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    case 128:
+      return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+                                 height);
+  }
+}
diff --git a/aom_dsp/arm/highbd_subpel_variance_neon.c b/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000..bdbbf70
--- /dev/null
+++ b/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
+//  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
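+//
+// For example (worked through for one offset): filter_offset == 3 gives the
+// reduced weights { 5, 3 }, so the blend below computes
+// (5 * s0 + 3 * s1 + 4) >> 3, which is identical to the full-precision
+// filter ((80 * s0 + 48 * s1 + 64) >> 7).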
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, blend);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+  } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+                                                uint16_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, blend);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+                                               uint16_t *dst_ptr,
+                                               int src_stride, int pixel_step,
+                                               int dst_height,
+                                               int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      128, dst_height, filter_offset);
+}
+
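+// When the filter offset is 4 both taps are equal, so the bilinear blend
+// reduces to a rounding average of the two source pixels (vrhaddq_u16).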
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
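+// General-case subpel variance: filter horizontally into tmp0 (h + 1 rows so
+// the vertical pass can read one row beyond the block), filter vertically
+// into tmp1, then compute the variance of tmp1 against the reference.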
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+                                                                               \
+    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
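+// Specialized variant used for the larger block sizes: xoffset/yoffset values
+// of 0 (no filtering in that direction) and 4 (both taps equal, i.e. a simple
+// pixel average) take cheaper paths; any other offset falls back to the
+// bilinear filter.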
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+                                      h);                                      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
+                                           src_stride, h, yoffset);            \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 8, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 16, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 32, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 64, dst_height,
+                                               filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 128, dst_height,
+                                               filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+      uint16x8_t p = vld1q_u16(second_pred);
+      avg = vrhaddq_u16(avg, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                            int src_stride, int dst_width, int dst_height,
+                            const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // This comp-avg helper is only used for large block sizes (>= 16x16), so
+  // the width is always a multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = vrhaddq_u16(s, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
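+// As above, but the second (vertical) pass also averages with second_pred, so
+// the compound prediction does not require an extra pass over the block.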
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
+  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t tmp0[w * (h + 1)];                                               \
+    uint16_t tmp1[w * h];                                                     \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
+                                                                              \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+                                       xoffset);                              \
+    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
+        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));      \
+                                                                              \
+    return aom_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
+        CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                   \
+  }
+
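+// As above, but specialized for xoffset/yoffset values of 0 (no filtering
+// required) and 4 (a 2-tap average suffices), avoiding redundant work for
+// the large block sizes.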
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
+  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
+      const uint8_t *second_pred) {                                            \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
+                        CONVERT_TO_SHORTPTR(second_pred));                     \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else if (yoffset == 4) {                                               \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp, source_stride, source_stride, w, h,                  \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp0, source_stride, 1, w, h,                             \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
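+// Generate masked sub-pixel variance functions: bilinear-filter horizontally
+// and vertically, blend with second_pred under the mask using
+// aom_highbd_comp_mask_pred_neon, then compute the variance of the blend.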
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                   \
+  unsigned int                                                                \
+      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(      \
+          const uint8_t *src, int src_stride, int xoffset, int yoffset,       \
+          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,     \
+          const uint8_t *msk, int msk_stride, int invert_mask,                \
+          unsigned int *sse) {                                                \
+    uint16_t tmp0[w * (h + 1)];                                               \
+    uint16_t tmp1[w * (h + 1)];                                               \
+    uint16_t tmp2[w * h];                                                     \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+                                       xoffset);                              \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
+    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w,  \
+                                   h, CONVERT_TO_BYTEPTR(tmp1), w, msk,       \
+                                   msk_stride, invert_mask);                  \
+    return aom_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
+        CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                   \
+  }
+
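+// As above, but with the xoffset/yoffset == 0 (copy) and == 4 (2-tap average)
+// cases handled separately to skip unnecessary filtering for large blocks.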
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)        \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
+          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
+          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
+          const uint8_t *msk, int msk_stride, int invert_mask,                 \
+          unsigned int *sse) {                                                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp0[w * h];                                                    \
+      if (yoffset == 0) {                                                      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred,  \
+                                       w, h, src, src_stride, msk, msk_stride, \
+                                       invert_mask);                           \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride,   \
+                                      w, h);                                   \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride,          \
+                                           src_stride, h, yoffset);            \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      if (yoffset == 0) {                                                      \
+        uint16_t tmp0[w * h];                                                  \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp0[w * (h + 1)];                                            \
+        uint16_t tmp1[w * h];                                                  \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp0[w * (h + 1)];                                            \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        uint16_t tmp2[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
+                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+                                       msk_stride, invert_mask);               \
+        return aom_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
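+// Generate OBMC sub-pixel variance functions: bilinear-filter the predictor
+// horizontally and vertically, then compute the OBMC variance against the
+// weighted source and mask.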
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                \
+  unsigned int                                                              \
+      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(      \
+          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,     \
+          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {    \
+    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                           \
+    uint16_t tmp0[w * (h + 1)];                                             \
+    uint16_t tmp1[w * h];                                                   \
+    highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+                                       xoffset);                            \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);       \
+    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(           \
+        CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                      \
+  }
+
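+// As above, but specialized for xoffset/yoffset values of 0 and 4 so the full
+// bilinear filter is only applied when it is actually needed.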
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)       \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(         \
+          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,        \
+          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {       \
+    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                              \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            pre, pre_stride, wsrc, mask, sse);                                 \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+                                      h);                                      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride,           \
+                                           pre_stride, h, yoffset);            \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h);     \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h,    \
+                                           xoffset);                           \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
+                                           h + 1, xoffset);                    \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
+                                           h + 1, xoffset);                    \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
+            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif  // !CONFIG_REALTIME_ONLY
+
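+// Implementation of aom_highbd_dist_wtd_comp_avg_pred for blocks having
+// width >= 16.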
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                                     int src_stride, int dst_width,
+                                     int dst_height,
+                                     const uint16_t *second_pred,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
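+// Combine averaging subpel filter with aom_highbd_dist_wtd_comp_avg_pred.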
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
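+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 4.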
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+  const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+    vst1_u16(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for large
+// blocks.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      second_pred += 8;
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
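+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 8.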
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_highbd_dist_wtd_comp_avg_pred for blocks
+// having width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
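+// Generate distance-weighted sub-pixel avg variance functions: bilinear
+// filter horizontally, apply the vertical filter fused with the
+// distance-weighted compound average, then compute the variance.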
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)              \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
+          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);                       \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1,     \
+                                       xoffset);                               \
+    highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                      \
+        tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                      \
+    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
+        CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);                \
+  }
+
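+// As above, but with the xoffset/yoffset == 0 and == 4 cases specialized to
+// avoid redundant filtering for the large block sizes.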
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)  \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
+          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);                       \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second,        \
+                                 jcp_param);                                   \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      } else if (yoffset == 4) {                                               \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
+            src, tmp, source_stride, source_stride, w, h, second, jcp_param);  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      } else {                                                                 \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            src, tmp, source_stride, source_stride, h, yoffset, second,        \
+            jcp_param);                                                        \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse);             \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
+            src, tmp0, source_stride, 1, w, h, second, jcp_param);             \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1);  \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w,   \
+                                                        h, second, jcp_param); \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1);  \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            src, tmp0, source_stride, 1, h, xoffset, second, jcp_param);       \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+                                           xoffset);                           \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w,   \
+                                                        h, second, jcp_param); \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+                                           xoffset);                           \
+        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
+            tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                  \
+        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);            \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/highbd_variance_neon.c b/aom_dsp/arm/highbd_variance_neon.c
index 948f2f7..e54fc18 100644
--- a/aom_dsp/arm/highbd_variance_neon.c
+++ b/aom_dsp/arm/highbd_variance_neon.c
@@ -15,10 +15,10 @@
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
 
 // Process a block of width 4 two rows at a time.
 static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
@@ -412,67 +412,6 @@
   return *sse;
 }
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
-                                            int src_stride,
-                                            const uint16_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            unsigned int *sse) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h / 2;
-  do {
-    uint16x8_t s0 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    uint16x8_t s1 = vld1q_u16(src_ptr);
-    src_ptr += src_stride;
-    uint16x8_t r0 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-    uint16x8_t r1 = vld1q_u16(ref_ptr);
-    ref_ptr += ref_stride;
-
-    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    uint8x16_t diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-  } while (--i != 0);
-
-  *sse = horizontal_add_u32x4(sse_u32);
-  return *sse;
-}
-
-static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
-                                             int src_stride,
-                                             const uint16_t *ref_ptr,
-                                             int ref_stride, int h,
-                                             unsigned int *sse) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint16x8_t s0 = vld1q_u16(src_ptr);
-    uint16x8_t s1 = vld1q_u16(src_ptr + 8);
-    uint16x8_t r0 = vld1q_u16(ref_ptr);
-    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
-
-    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
-    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
-
-    uint8x16_t diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, diff, diff);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  *sse = horizontal_add_u32x4(sse_u32);
-  return *sse;
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
                                             int src_stride,
                                             const uint16_t *ref_ptr,
@@ -491,8 +430,6 @@
                              sse);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 #define HIGHBD_MSE_WXH_NEON(w, h)                                       \
   uint32_t aom_highbd_8_mse##w##x##h##_neon(                            \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,   \
@@ -529,3 +466,55 @@
 HIGHBD_MSE_WXH_NEON(8, 8)
 
 #undef HIGHBD_MSE_WXH_NEON
+
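+// Accumulate into 'sum' the sum of squared differences between two rows of
+// eight 16-bit pixels, widening to 64-bit lanes so that repeated accumulation
+// of 10/12-bit data cannot overflow.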
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+                                                uint16x8_t s1, uint16x8_t d0,
+                                                uint16x8_t d1) {
+  uint16x8_t e0 = vabdq_u16(s0, d0);
+  uint16x8_t e1 = vabdq_u16(s1, d1);
+
+  uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+  mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+  mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+  mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+  return vpadalq_u32(sum, mse);
+}
+
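+// Sum of squared errors (not yet divided by the block area) over a w x h
+// block of 16-bit pixels, for the 4x4, 4x8, 8x4 and 8x8 block sizes.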
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t sum = vdupq_n_u64(0);
+
+  if (w == 8) {
+    do {
+      uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+      uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+      uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+      uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+      sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+      dst += 2 * dstride;
+      src += 2 * sstride;
+      h -= 2;
+    } while (h != 0);
+  } else {  // w == 4
+    do {
+      uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+      uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      h -= 4;
+    } while (h != 0);
+  }
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/aom_dsp/arm/highbd_variance_neon_dotprod.c b/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000..d56ae97
--- /dev/null
+++ b/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
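+// The aom_highbd_8_mse* functions operate on 8-bit content stored in 16-bit
+// buffers, so the pixels can be narrowed to 8 bits losslessly and the squared
+// differences accumulated with the UDOT (vdotq_u32) instruction.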
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h,
+                                                    unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    uint16x8_t r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h,
+                                                     unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
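+// Define the aom_highbd_8_mse<w>x<h>_neon_dotprod entry points;
+// CONVERT_TO_SHORTPTR recovers the underlying uint16_t buffers from the byte
+// pointers used by the high bit depth API.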
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                 \
+  uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod(                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+                                     sse);                                \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 2161378..41f070e 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,7 +24,7 @@
 // DC 4x4
 
 static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
-  const uint8x8_t a = load_u8_4x1_lane0(in);
+  const uint8x8_t a = load_u8_4x1(in);
   const uint16x4_t p0 = vpaddl_u8(a);
   const uint16x4_t p1 = vpadd_u16(p0, p0);
   return vcombine_u16(p1, vdup_n_u16(0));
@@ -354,7 +354,7 @@
 
 void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x8_t a = load_u8_4x1(above);
   uint8x8_t l = vld1_u8(left);
   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
   uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
@@ -364,7 +364,7 @@
 void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   uint8x8_t a = vld1_u8(above);
-  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint8x8_t l = load_u8_4x1(left);
   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
   uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
   dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
@@ -372,7 +372,7 @@
 
 void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x8_t a = load_u8_4x1(above);
   uint8x16_t l = vld1q_u8(left);
   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
   uint32_t sum = horizontal_add_u16x8(sum_al);
@@ -383,7 +383,7 @@
 void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   uint8x16_t a = vld1q_u8(above);
-  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint8x8_t l = load_u8_4x1(left);
   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
   uint32_t sum = horizontal_add_u16x8(sum_al);
   uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
@@ -620,7 +620,7 @@
 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
 }
 
 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -646,13 +646,13 @@
 void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
 }
 
 void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)left;
-  v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
 }
 
 void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -856,7 +856,7 @@
 
 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
   store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
@@ -907,7 +907,7 @@
 
 void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
   vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
@@ -936,7 +936,7 @@
 
 void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  const uint8x8_t d0 = load_u8_4x1(left);
   (void)above;
   vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
   vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
@@ -1594,8 +1594,10 @@
       base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
 
 #if AOM_ARCH_AARCH64
-      uint8x8_t left_idx0 = vreinterpret_u8_s16(base_y_c64 + 2);  // [0, 16]
-      uint8x8_t left_idx1 = vreinterpret_u8_s16(base_y_c64 + 3);  // [1, 17]
+      uint8x8_t left_idx0 =
+          vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
+      uint8x8_t left_idx1 =
+          vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]
 
       uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
       uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
@@ -1777,8 +1779,10 @@
       base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
 
 #if AOM_ARCH_AARCH64
-      uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c128 + 2);  // [0, 33]
-      uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c128 + 3);  // [1, 34]
+      uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+          vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
+      uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+          vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
       uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
 
       uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
@@ -2025,8 +2029,10 @@
 
 #if AOM_ARCH_AARCH64
           // Values in left_idx{0,1} range from 0 through 63 inclusive.
-          uint8x16_t left_idx0 = vreinterpretq_u8_s16(base_y_c256.val[0] + 1);
-          uint8x16_t left_idx1 = vreinterpretq_u8_s16(base_y_c256.val[1] + 1);
+          uint8x16_t left_idx0 = vreinterpretq_u8_s16(
+              vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1)));
+          uint8x16_t left_idx1 = vreinterpretq_u8_s16(
+              vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1)));
 
           uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
 
@@ -3168,12 +3174,10 @@
   const uint8_t bottom_left = left_column[height - 1];
   const uint8_t *const weights_y = smooth_weights + height - 4;
 
-  uint8x8_t UNINITIALIZED_IS_SAFE(top_v);
-  load_u8_4x1(top_row, &top_v, 0);
+  uint8x8_t top_v = load_u8_4x1(top_row);
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
-  uint8x8_t UNINITIALIZED_IS_SAFE(weights_x_v);
-  load_u8_4x1(smooth_weights, &weights_x_v, 0);
+  uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
 
@@ -3403,9 +3407,9 @@
     const uint8_t bottom_left = left_column[height - 1];              \
     const uint8_t *const weights_y = smooth_weights + height - 4;     \
                                                                       \
-    uint8x8_t UNINITIALIZED_IS_SAFE(top_v);                           \
+    uint8x8_t top_v;                                                  \
     if ((W) == 4) {                                                   \
-      load_u8_4x1(top_row, &top_v, 0);                                \
+      top_v = load_u8_4x1(top_row);                                   \
     } else { /* width == 8 */                                         \
       top_v = vld1_u8(top_row);                                       \
     }                                                                 \
@@ -3717,9 +3721,9 @@
                                        int width, int height) {
   const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
-  uint8x8_t UNINITIALIZED_IS_SAFE(top);
+  uint8x8_t top;
   if (width == 4) {
-    load_u8_4x1(top_row, &top, 0);
+    top = load_u8_4x1(top_row);
   } else {  // width == 8
     top = vld1_u8(top_row);
   }
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index 8fc7ccb..0e683a7 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -634,13 +634,13 @@
   p6p2 = vget_low_u8(row1);
   p5p1 = vget_low_u8(row2);
   p4p0 = vget_low_u8(row3);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+  transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
 
   q0q4 = vget_high_u8(row0);
   q1q5 = vget_high_u8(row1);
   q2q6 = vget_high_u8(row2);
   q3qy = vget_high_u8(row3);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+  transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
   pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
@@ -679,13 +679,13 @@
   q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
   q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
   q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+  transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
 
   pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
   p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
   p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
   p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+  transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
 
   row0 = vcombine_u8(pxp3, q0q4);
   row1 = vcombine_u8(p6p2, q1q5);
@@ -725,7 +725,7 @@
   // row3: p3 p2 p1 p0 | q0 q1 q2 q3
   load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
 
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
   p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
@@ -750,7 +750,7 @@
   p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
   p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
   p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
 
   store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
@@ -784,7 +784,7 @@
   // row3: px p2 p1 p0 | q0 q1 q2 qy
   load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
 
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+  transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
 
   pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
   pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
@@ -809,7 +809,7 @@
   p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
   p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
   pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+  transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
 
   store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
 }
@@ -834,7 +834,7 @@
                              const uint8_t *limit, const uint8_t *thresh) {
   uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
   uint32x2_t pq_rev;
-  uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1);
+  uint8x8_t p1p0, q0q1;
   uint8x8_t p0q0, p1q1;
 
   // row0: p1 p0 | q0 q1
@@ -843,7 +843,7 @@
   // row3: p1 p0 | q0 q1
   load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
 
-  transpose_u8_4x4(&p1p0, &q0q1);
+  transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
 
   p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
 
@@ -860,7 +860,7 @@
   p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
   q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
 
-  transpose_u8_4x4(&p1p0, &q0q1);
+  transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
 
   store_unaligned_u8_4x1(src - 2, p1p0, 0);
   store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
@@ -886,25 +886,13 @@
 
 void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1),
-      UNINITIALIZED_IS_SAFE(p2q2), UNINITIALIZED_IS_SAFE(p3q3),
-      UNINITIALIZED_IS_SAFE(p4q4), UNINITIALIZED_IS_SAFE(p5q5),
-      UNINITIALIZED_IS_SAFE(p6q6);
-
-  load_u8_4x1(src - 7 * stride, &p6q6, 0);
-  load_u8_4x1(src - 6 * stride, &p5q5, 0);
-  load_u8_4x1(src - 5 * stride, &p4q4, 0);
-  load_u8_4x1(src - 4 * stride, &p3q3, 0);
-  load_u8_4x1(src - 3 * stride, &p2q2, 0);
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
-  load_u8_4x1(src + 2 * stride, &p2q2, 1);
-  load_u8_4x1(src + 3 * stride, &p3q3, 1);
-  load_u8_4x1(src + 4 * stride, &p4q4, 1);
-  load_u8_4x1(src + 5 * stride, &p5q5, 1);
-  load_u8_4x1(src + 6 * stride, &p6q6, 1);
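+  // Each pNqN vector holds four pixels of the pN row in its low half and four
+  // pixels of the qN row in its high half; the second argument of
+  // load_u8_4x2 is the offset from the pN row down to the matching qN row.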
+  uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+  uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+  uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+  uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+  uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
 
   lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
               *thresh);
@@ -1036,12 +1024,8 @@
 
 void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1);
-
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
 
   lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
 
diff --git a/aom_dsp/arm/masked_sad4d_neon.c b/aom_dsp/arm/masked_sad4d_neon.c
index 98daeda..8f65b80 100644
--- a/aom_dsp/arm/masked_sad4d_neon.c
+++ b/aom_dsp/arm/masked_sad4d_neon.c
@@ -516,19 +516,18 @@
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
-#define MASKED_SAD4D_WXH_NEON(w, h)                                           \
-  void aom_masked_sad##w##x##h##x4d_neon(                                     \
-      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
-      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int invert_mask, uint32_t res[4]) {                     \
-    if (invert_mask) {                                                        \
-      return masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,  \
-                                           second_pred, msk, msk_stride, res, \
-                                           h);                                \
-    } else {                                                                  \
-      return masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,      \
-                                       second_pred, msk, msk_stride, res, h); \
-    }                                                                         \
+#define MASKED_SAD4D_WXH_NEON(w, h)                                            \
+  void aom_masked_sad##w##x##h##x4d_neon(                                      \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int invert_mask, uint32_t res[4]) {                      \
+    if (invert_mask) {                                                         \
+      masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride,          \
+                                    second_pred, msk, msk_stride, res, h);     \
+    } else {                                                                   \
+      masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+                                msk, msk_stride, res, h);                      \
+    }                                                                          \
   }
 
 MASKED_SAD4D_WXH_NEON(4, 8)
diff --git a/aom_dsp/arm/masked_sad_neon.c b/aom_dsp/arm/masked_sad_neon.c
index 340df05..9d26310 100644
--- a/aom_dsp/arm/masked_sad_neon.c
+++ b/aom_dsp/arm/masked_sad_neon.c
@@ -15,9 +15,10 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/blend.h"
-#include "mem_neon.h"
-#include "sum_neon.h"
 
 static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
                                               const uint8_t *src,
@@ -29,15 +30,7 @@
   uint8x16_t b0 = vld1q_u8(b);
   uint8x16_t s0 = vld1q_u8(src);
 
-  uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-  uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
-  uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
-  blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
-  blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
-
-  uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
-  uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
-  uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+  uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
 
   return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
 }
@@ -164,10 +157,7 @@
     uint8x8_t b0 = vld1_u8(b);
     uint8x8_t s0 = vld1_u8(src);
 
-    uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-    uint16x8_t blend_u16 = vmull_u8(m0, a0);
-    blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+    uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
 
     sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
 
@@ -199,10 +189,7 @@
     uint8x8_t b0 = load_unaligned_u8(b, b_stride);
     uint8x8_t s0 = load_unaligned_u8(src, src_stride);
 
-    uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
-    uint16x8_t blend_u16 = vmull_u8(m0, a0);
-    blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+    uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
 
     sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
 
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 16d44c5..d1ac648 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -43,6 +43,11 @@
   return res;
 }
 
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+  return res;
+}
+
 static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
@@ -85,18 +90,31 @@
   return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
 }
 
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define load_u8_4x1(s, s0, lane)                                           \
-  do {                                                                     \
-    *(s0) = vreinterpret_u8_u32(                                           \
-        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
-  } while (0)
-
 // Load four bytes into the low half of a uint8x8_t, zero the upper half.
-static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
   uint8x8_t ret = vdup_n_u8(0);
-  load_u8_4x1(p, &ret, 0);
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+  return ret;
+}
+
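+// Load four bytes from each of two rows 'stride' apart into the low and high
+// halves of a uint8x8_t.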
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+  uint8x8_t ret = vdup_n_u8(0);
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+  p += stride;
+  ret = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+  return ret;
+}
+
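+// Load two 16-bit elements from each of two rows 'stride' apart into the low
+// and high halves of a uint16x4_t.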
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+  uint16x4_t ret = vdup_n_u16(0);
+  ret = vreinterpret_u16_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+  p += stride;
+  ret = vreinterpret_u16_u32(
+      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
   return ret;
 }
 
@@ -214,6 +232,38 @@
   s += p;
 }
 
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+                                 int16x4_t *const s0, int16x4_t *const s1,
+                                 int16x4_t *const s2, int16x4_t *const s3,
+                                 int16x4_t *const s4, int16x4_t *const s5,
+                                 int16x4_t *const s6, int16x4_t *const s7,
+                                 int16x4_t *const s8, int16x4_t *const s9,
+                                 int16x4_t *const s10, int16x4_t *const s11) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+  s += p;
+  *s8 = vld1_s16(s);
+  s += p;
+  *s9 = vld1_s16(s);
+  s += p;
+  *s10 = vld1_s16(s);
+  s += p;
+  *s11 = vld1_s16(s);
+}
+
 static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                  int16x4_t *const s0, int16x4_t *const s1,
                                  int16x4_t *const s2, int16x4_t *const s3,
@@ -316,6 +366,23 @@
   *s6 = vld1_s16(s);
 }
 
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+}
+
 static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
@@ -592,6 +659,33 @@
   *s10 = vld1_u8(s);
 }
 
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+}
+
 static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                  int16x8_t *const s0, int16x8_t *const s1,
                                  int16x8_t *const s2, int16x8_t *const s3,
@@ -622,6 +716,38 @@
   *s10 = vld1q_s16(s);
 }
 
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9,
+                                 int16x8_t *const s10, int16x8_t *const s11) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+  s += p;
+  *s10 = vld1q_s16(s);
+  s += p;
+  *s11 = vld1q_s16(s);
+}
+
 static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                  uint16x8_t *const s0, uint16x8_t *const s1,
                                  uint16x8_t *const s2, uint16x8_t *const s3,
@@ -714,6 +840,23 @@
   *s6 = vld1q_s16(s);
 }
 
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+}
+
 static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
@@ -793,6 +936,24 @@
   return vreinterpret_u8_u32(a_u32);
 }
 
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+  uint32_t a;
+  uint32x2_t a_u32;
+
+  memcpy(&a, buf, 4);
+  a_u32 = vdup_n_u32(a);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+  uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
 static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
   uint32_t a;
   uint32x2_t a_u32;
@@ -844,6 +1005,20 @@
     memcpy(dst, &a, 2);                                \
   } while (0)
 
+#define store_unaligned_u16_2x1(dst, src, lane)         \
+  do {                                                  \
+    uint32_t a;                                         \
+    a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+    memcpy(dst, &a, 4);                                 \
+  } while (0)
+
+#define store_unaligned_u16_4x1(dst, src, lane)           \
+  do {                                                    \
+    uint64_t a;                                           \
+    a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+    memcpy(dst, &a, 8);                                   \
+  } while (0)
+
 static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                 uint8x16_t *const s0, uint8x16_t *const s1,
                                 uint8x16_t *const s2, uint8x16_t *const s3,
@@ -917,6 +1092,27 @@
   *s7 = vld1q_u16(s + 8);
 }
 
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+                                                int stride) {
+  uint32_t a;
+  uint32x2_t a_u32;
+
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vdup_n_u32(a);
+  memcpy(&a, buf, 4);
+  a_u32 = vset_lane_u32(a, a_u32, 1);
+  return vreinterpret_u16_u32(a_u32);
+}
+
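+// Load four contiguous 16-bit elements, with no alignment requirement.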
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+  uint64_t a;
+  uint64x1_t a_u64 = vdup_n_u64(0);
+  memcpy(&a, buf, 8);
+  a_u64 = vset_lane_u64(a, a_u64, 0);
+  return vreinterpret_u16_u64(a_u64);
+}
+
 static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                 uint32_t stride) {
   uint64_t a;
@@ -1004,4 +1200,32 @@
   vst1q_s32(buf, v0);
 }
 
+static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride,
+                                          uint8x8_t src) {
+  store_unaligned_u8_2x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u8_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride,
+                                          uint8x8_t src) {
+  store_unaligned_u8_4x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u8_4x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride,
+                                           uint16x4_t src) {
+  store_unaligned_u16_2x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u16_2x1(dst, src, 1);
+}
+
+static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride,
+                                           uint16x8_t src) {
+  store_unaligned_u16_4x1(dst, src, 0);
+  dst += dst_stride;
+  store_unaligned_u16_4x1(dst, src, 1);
+}
+
 #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 60efef8..46a1666 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -15,93 +15,10 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *ref_ptr, int ref_stride,
-                                       int w, int h) {
-  // Only two accumulators are required for optimal instruction throughput of
-  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
-      s0 = vld1q_u8(src_ptr + j);
-      r0 = vld1q_u8(ref_ptr + j);
-      diff0 = vabdq_u8(s0, r0);
-      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-      s1 = vld1q_u8(src_ptr + j + 16);
-      r1 = vld1q_u8(ref_ptr + j + 16);
-      diff1 = vabdq_u8(s1, r1);
-      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-      j += 32;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
-                                         const uint8_t *ref_ptr, int ref_stride,
-                                         int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
-}
-
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
-}
-
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
-}
-
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h / 2;
-  do {
-    uint8x16_t s0, s1, r0, r1, diff0, diff1;
-
-    s0 = vld1q_u8(src_ptr);
-    r0 = vld1q_u8(ref_ptr);
-    diff0 = vabdq_u8(s0, r0);
-    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    s1 = vld1q_u8(src_ptr);
-    r1 = vld1q_u8(ref_ptr);
-    diff1 = vabdq_u8(s1, r1);
-    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          int h) {
@@ -220,28 +137,25 @@
 static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
-  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
 
   int i = h;
   do {
     uint8x16_t s0 = vld1q_u8(src_ptr);
     uint8x16_t r0 = vld1q_u8(ref_ptr);
     uint8x16_t diff0 = vabdq_u8(s0, r0);
-    uint16x8_t sum0 = vpaddlq_u8(diff0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
 
     uint8x16_t s1 = vld1q_u8(src_ptr + 16);
     uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
     uint8x16_t diff1 = vabdq_u8(s1, r1);
-    uint16x8_t sum1 = vpaddlq_u8(diff1);
-
-    sum = vpadalq_u16(sum, sum0);
-    sum = vpadalq_u16(sum, sum1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
 
     src_ptr += src_stride;
     ref_ptr += ref_stride;
   } while (--i != 0);
 
-  return horizontal_add_u32x4(sum);
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
 static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -264,8 +178,6 @@
   return horizontal_add_u16x8(sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
@@ -384,114 +296,6 @@
 
 #undef SAD_SKIP_WXH_NEON
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
-                                           int src_stride,
-                                           const uint8_t *ref_ptr,
-                                           int ref_stride, int w, int h,
-                                           const uint8_t *second_pred) {
-  // Only two accumulators are required for optimal instruction throughput of
-  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-      s0 = vld1q_u8(src_ptr + j);
-      r0 = vld1q_u8(ref_ptr + j);
-      p0 = vld1q_u8(second_pred);
-      avg0 = vrhaddq_u8(r0, p0);
-      diff0 = vabdq_u8(s0, avg0);
-      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-      s1 = vld1q_u8(src_ptr + j + 16);
-      r1 = vld1q_u8(ref_ptr + j + 16);
-      p1 = vld1q_u8(second_pred + 16);
-      avg1 = vrhaddq_u8(r1, p1);
-      diff1 = vabdq_u8(s1, avg1);
-      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-      j += 32;
-      second_pred += 32;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
-                                             int src_stride,
-                                             const uint8_t *ref_ptr,
-                                             int ref_stride, int h,
-                                             const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h / 2;
-  do {
-    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-    s0 = vld1q_u8(src_ptr);
-    r0 = vld1q_u8(ref_ptr);
-    p0 = vld1q_u8(second_pred);
-    avg0 = vrhaddq_u8(r0, p0);
-    diff0 = vabdq_u8(s0, avg0);
-    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-
-    s1 = vld1q_u8(src_ptr);
-    r1 = vld1q_u8(ref_ptr);
-    p1 = vld1q_u8(second_pred);
-    avg1 = vrhaddq_u8(r1, p1);
-    diff1 = vabdq_u8(s1, avg1);
-    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
@@ -644,7 +448,7 @@
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
                                             const uint8_t *second_pred) {
-  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
 
   int i = h;
   do {
@@ -653,24 +457,21 @@
     uint8x16_t p0 = vld1q_u8(second_pred);
     uint8x16_t avg0 = vrhaddq_u8(r0, p0);
     uint8x16_t diff0 = vabdq_u8(s0, avg0);
-    uint16x8_t sum0 = vpaddlq_u8(diff0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
 
     uint8x16_t s1 = vld1q_u8(src_ptr + 16);
     uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
     uint8x16_t p1 = vld1q_u8(second_pred + 16);
     uint8x16_t avg1 = vrhaddq_u8(r1, p1);
     uint8x16_t diff1 = vabdq_u8(s1, avg1);
-    uint16x8_t sum1 = vpaddlq_u8(diff1);
-
-    sum = vpadalq_u16(sum, sum0);
-    sum = vpadalq_u16(sum, sum1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
 
     src_ptr += src_stride;
     ref_ptr += ref_stride;
     second_pred += 32;
   } while (--i != 0);
 
-  return horizontal_add_u32x4(sum);
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
 }
 
 static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
@@ -698,8 +499,6 @@
   return horizontal_add_u16x8(sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
@@ -788,3 +587,287 @@
 #endif  // !CONFIG_REALTIME_ONLY
 
 #undef SAD_WXH_AVG_NEON
+
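+// Distance-weighted compound SAD: the second prediction and the reference are
+// first combined with dist_wtd_avg_u8x{8,16} (a weighted average using the
+// fwd_offset/bck_offset weights in jcp_param), then the SAD of the result
+// against the source block is accumulated.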
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to prevent overflow for large values of 'h' and to
+  // enable optimal UADALP instruction throughput on CPUs that have either 2
+  // or 4 Neon pipes.
+  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
+    uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    uint8x16_t p4 = vld1q_u8(second_pred + 64);
+    uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+    uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+    sum[4] = vpadalq_u8(sum[4], diff4);
+
+    uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    uint8x16_t p5 = vld1q_u8(second_pred + 80);
+    uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+    uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+    sum[5] = vpadalq_u8(sum[5], diff5);
+
+    uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    uint8x16_t p6 = vld1q_u8(second_pred + 96);
+    uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+    uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+    sum[6] = vpadalq_u8(sum[6], diff6);
+
+    uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    uint8x16_t p7 = vld1q_u8(second_pred + 112);
+    uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+    uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+    sum[7] = vpadalq_u8(sum[7], diff7);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--h != 0);
+
+  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+  return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--h != 0);
+
+  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+  return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+    uint8x16_t p = vld1q_u8(second_pred);
+
+    uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+    uint8x16_t diff = vabdq_u8(s, wtd_avg);
+    sum = vpadalq_u8(sum, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+    sum = vabal_u8(sum, s, wtd_avg);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
+  } while (--h != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+    sum = vabal_u8(sum, s, wtd_avg);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--i != 0);
+
+  return horizontal_add_u16x8(sum);
+}
+
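+// Define the aom_dist_wtd_sad<w>x<h>_avg_neon entry points for each block
+// size by dispatching to the width-specialized helpers above.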
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h)                                        \
+  unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
+    return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+                                        second_pred, jcp_param);               \
+  }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000..5504c68
--- /dev/null
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
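+  // Each UDOT against a vector of ones reduces sixteen absolute differences
+  // into four 32-bit lanes.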
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+                                                 int src_stride,
+                                                 const uint8_t *ref_ptr,
+                                                 int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
+  unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,              \
+      int ref_stride) {                                                    \
+    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
+  unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,    \
+      int ref_stride) {                                          \
+    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+                                       2 * ref_stride, (h) / 2); \
+  }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
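+      // Average the reference and second prediction with a rounding halving
+      // add before taking the SAD against the source.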
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    p1 = vld1q_u8(second_pred);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
+  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+                                       second_pred);                          \
+  }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to minimize the accumulation and loop-carried
+  // dependencies for better instruction throughput.
+  uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0) };
+
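+  // Each 128-pixel row is split into eight 16-byte blocks, one per
+  // accumulator.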
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+    uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    uint8x16_t p4 = vld1q_u8(second_pred + 64);
+    uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+    uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+    sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+    uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    uint8x16_t p5 = vld1q_u8(second_pred + 80);
+    uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+    uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+    sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+    uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    uint8x16_t p6 = vld1q_u8(second_pred + 96);
+    uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+    uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+    sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+    uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    uint8x16_t p7 = vld1q_u8(second_pred + 112);
+    uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+    uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+    sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[4] = vaddq_u32(sum[4], sum[5]);
+  sum[6] = vaddq_u32(sum[6], sum[7]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+  sum[4] = vaddq_u32(sum[4], sum[6]);
+  sum[0] = vaddq_u32(sum[0], sum[4]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+    uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+    sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+    uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+    uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+    sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--h != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+    uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    uint8x16_t s1 = vld1q_u8(src_ptr);
+    uint8x16_t r1 = vld1q_u8(ref_ptr);
+    uint8x16_t p1 = vld1q_u8(second_pred);
+    uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+    uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h)                               \
+  unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod(                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
+    return dist_wtd_sad##w##xh_avg_neon_dotprod(                              \
+        src, src_stride, ref, ref_stride, (h), second_pred, jcp_param);       \
+  }
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD
diff --git a/aom_dsp/arm/sadxd_neon.c b/aom_dsp/arm/sadxd_neon.c
index 81803b1..e89e1c5 100644
--- a/aom_dsp/arm/sadxd_neon.c
+++ b/aom_dsp/arm/sadxd_neon.c
@@ -18,90 +18,6 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
-                              uint32x4_t *const sad_sum) {
-  uint8x16_t abs_diff = vabdq_u8(src, ref);
-  *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
-}
-
-static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
-                                        const uint8_t *const ref[4],
-                                        int ref_stride, uint32_t res[4], int w,
-                                        int h) {
-  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
-  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int ref_offset = 0;
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      const uint8x16_t s0 = vld1q_u8(src + j);
-      sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
-      sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
-      sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
-
-      const uint8x16_t s1 = vld1q_u8(src + j + 16);
-      sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
-      sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
-      sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
-
-      j += 32;
-    } while (j < w);
-
-    src += src_stride;
-    ref_offset += ref_stride;
-  } while (--i != 0);
-
-  res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
-  res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
-  res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
-}
-
-static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *const ref[4], int ref_stride,
-                                    uint32_t res[4], int h) {
-  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h);
-}
-
-static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h);
-}
-
-static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 32, h);
-}
-
-static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int ref_offset = 0;
-  int i = h;
-  do {
-    const uint8x16_t s = vld1q_u8(src);
-    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
-    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
-    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
-
-    src += src_stride;
-    ref_offset += ref_stride;
-  } while (--i != 0);
-
-  res[0] = horizontal_add_u32x4(sum[0]);
-  res[1] = horizontal_add_u32x4(sum[1]);
-  res[2] = horizontal_add_u32x4(sum[2]);
-}
-
-#else  // !(defined(__ARM_FEATURE_DOTPROD))
-
 static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                               uint16x8_t *const sad_sum) {
   uint8x16_t abs_diff = vabdq_u8(src, ref);
@@ -218,8 +134,6 @@
   res[2] = horizontal_add_u16x8(sum[2]);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
@@ -325,92 +239,6 @@
 
 #undef SAD_WXH_3D_NEON
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
-                                        const uint8_t *const ref[4],
-                                        int ref_stride, uint32_t res[4], int w,
-                                        int h) {
-  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                           vdupq_n_u32(0) };
-  uint32x4_t sum[4];
-
-  int ref_offset = 0;
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      const uint8x16_t s0 = vld1q_u8(src + j);
-      sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
-      sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
-      sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
-      sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
-
-      const uint8x16_t s1 = vld1q_u8(src + j + 16);
-      sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
-      sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
-      sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
-      sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
-
-      j += 32;
-    } while (j < w);
-
-    src += src_stride;
-    ref_offset += ref_stride;
-  } while (--i != 0);
-
-  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
-  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
-  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
-  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *const ref[4], int ref_stride,
-                                    uint32_t res[4], int h) {
-  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h);
-}
-
-static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h);
-}
-
-static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 32, h);
-}
-
-static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref[4], int ref_stride,
-                                   uint32_t res[4], int h) {
-  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
-                        vdupq_n_u32(0) };
-
-  int ref_offset = 0;
-  int i = h;
-  do {
-    const uint8x16_t s = vld1q_u8(src);
-    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
-    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
-    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
-    sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
-
-    src += src_stride;
-    ref_offset += ref_stride;
-  } while (--i != 0);
-
-  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
-}
-
-#else  // !(defined(__ARM_FEATURE_DOTPROD))
-
 static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
                                         const uint8_t *const ref[4],
                                         int ref_stride, uint32_t res[4], int w,
@@ -534,8 +362,6 @@
   vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
diff --git a/aom_dsp/arm/sadxd_neon_dotprod.c b/aom_dsp/arm/sadxd_neon_dotprod.c
new file mode 100644
index 0000000..3d11d1c
--- /dev/null
+++ b/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
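+// Accumulate the SAD of one 16-byte block into *sad_sum using UDOT against a
+// vector of ones.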
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint32x4_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *const ref[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int w, int h) {
+  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int ref_offset = 0;
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint8x16_t s0 = vld1q_u8(src + j);
+      sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+      sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+      sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+      const uint8x16_t s1 = vld1q_u8(src + j + 16);
+      sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+      sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+      sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+      j += 32;
+    } while (j < w);
+
+    src += src_stride;
+    ref_offset += ref_stride;
+  } while (--i != 0);
+
+  res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+  res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+  res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *const ref[4],
+                                            int ref_stride, uint32_t res[4],
+                                            int h) {
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int ref_offset = 0;
+  int i = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src);
+    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+    src += src_stride;
+    ref_offset += ref_stride;
+  } while (--i != 0);
+
+  res[0] = horizontal_add_u32x4(sum[0]);
+  res[1] = horizontal_add_u32x4(sum[1]);
+  res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON_DOTPROD(w, h)                                         \
+  void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \
+                                          const uint8_t *const ref[4],        \
+                                          int ref_stride, uint32_t res[4]) {  \
+    sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h));   \
+  }
+
+SAD_WXH_3D_NEON_DOTPROD(16, 8)
+SAD_WXH_3D_NEON_DOTPROD(16, 16)
+SAD_WXH_3D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_3D_NEON_DOTPROD(32, 16)
+SAD_WXH_3D_NEON_DOTPROD(32, 32)
+SAD_WXH_3D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_3D_NEON_DOTPROD(64, 32)
+SAD_WXH_3D_NEON_DOTPROD(64, 64)
+SAD_WXH_3D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_3D_NEON_DOTPROD(128, 64)
+SAD_WXH_3D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON_DOTPROD(16, 4)
+SAD_WXH_3D_NEON_DOTPROD(16, 64)
+SAD_WXH_3D_NEON_DOTPROD(32, 8)
+SAD_WXH_3D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON_DOTPROD
+
+static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *const ref[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int w, int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int ref_offset = 0;
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint8x16_t s0 = vld1q_u8(src + j);
+      sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+      sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+      sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+      sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+      const uint8x16_t s1 = vld1q_u8(src + j + 16);
+      sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+      sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+      sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+      sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+      j += 32;
+    } while (j < w);
+
+    src += src_stride;
+    ref_offset += ref_stride;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *const ref[4],
+                                            int ref_stride, uint32_t res[4],
+                                            int h) {
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int ref_offset = 0;
+  int i = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src);
+    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+    sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
+
+    src += src_stride;
+    ref_offset += ref_stride;
+  } while (--i != 0);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h)                                         \
+  void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \
+                                          const uint8_t *const ref[4],        \
+                                          int ref_stride, uint32_t res[4]) {  \
+    sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h));   \
+  }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_WXH_4D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
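+// The "skip" variants estimate the SAD from every other row: the strides are
+// doubled, the height is halved, and the results are doubled to approximate
+// the full-height SADs.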
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h)                                    \
+  void aom_sad_skip_##w##x##h##x4d_neon_dotprod(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref[4],        \
+      int ref_stride, uint32_t res[4]) {                                      \
+    sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+                               ((h) >> 1));                                   \
+    res[0] <<= 1;                                                             \
+    res[1] <<= 1;                                                             \
+    res[2] <<= 1;                                                             \
+    res[3] <<= 1;                                                             \
+  }
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index d1d3d93..ec8f0ee 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -11,119 +11,8 @@
 #include <arm_neon.h>
 
 #include "config/aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
-
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
-                                 uint32x4_t *sse) {
-  uint8x16_t s = vld1q_u8(src);
-  uint8x16_t r = vld1q_u8(ref);
-
-  uint8x16_t abs_diff = vabdq_u8(s, r);
-
-  *sse = vdotq_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
-                                uint32x2_t *sse) {
-  uint8x8_t s = vld1_u8(src);
-  uint8x8_t r = vld1_u8(ref);
-
-  uint8x8_t abs_diff = vabd_u8(s, r);
-
-  *sse = vdot_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
-                                const uint8_t *ref, int ref_stride,
-                                uint32x2_t *sse) {
-  uint8x8_t s = load_unaligned_u8(src, src_stride);
-  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
-
-  uint8x8_t abs_diff = vabd_u8(s, r);
-
-  *sse = vdot_u32(*sse, abs_diff, abs_diff);
-}
-
-static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    int height) {
-  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
-
-  int i = height;
-  do {
-    sse_8x1_neon(src, ref, &sse[0]);
-    src += src_stride;
-    ref += ref_stride;
-    sse_8x1_neon(src, ref, &sse[1]);
-    src += src_stride;
-    ref += ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
-}
-
-static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    int height) {
-  uint32x2_t sse = vdup_n_u32(0);
-
-  int i = height;
-  do {
-    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
-
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  return horizontal_add_u32x2(sse);
-}
-
-static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    int width, int height) {
-  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
-
-  if ((width & 0x07) && ((width & 0x07) < 5)) {
-    int i = height;
-    do {
-      int j = 0;
-      do {
-        sse_8x1_neon(src + j, ref + j, &sse[0]);
-        sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
-        j += 8;
-      } while (j + 4 < width);
-
-      sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse[0]);
-      src += 2 * src_stride;
-      ref += 2 * ref_stride;
-      i -= 2;
-    } while (i != 0);
-  } else {
-    int i = height;
-    do {
-      int j = 0;
-      do {
-        sse_8x1_neon(src + j, ref + j, &sse[0]);
-        sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse[1]);
-        j += 8;
-      } while (j < width);
-
-      src += 2 * src_stride;
-      ref += 2 * ref_stride;
-      i -= 2;
-    } while (i != 0);
-  }
-  return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
 
 static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
                                  uint32x4_t *sse) {
@@ -159,39 +48,6 @@
   *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
 }
 
-static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    int height) {
-  uint32x4_t sse = vdupq_n_u32(0);
-
-  int i = height;
-  do {
-    sse_8x1_neon(src, ref, &sse);
-
-    src += src_stride;
-    ref += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(sse);
-}
-
-static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    int height) {
-  uint32x4_t sse = vdupq_n_u32(0);
-
-  int i = height;
-  do {
-    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
-
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  return horizontal_add_u32x4(sse);
-}
-
 static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int width, int height) {
@@ -228,8 +84,6 @@
   return horizontal_add_u32x4(sse);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int height) {
@@ -308,6 +162,39 @@
   return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
 }
 
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_8x1_neon(src, ref, &sse);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_u32x4(sse);
+}
+
 int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
                      int ref_stride, int width, int height) {
   switch (width) {
@@ -321,268 +208,3 @@
       return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
   }
 }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) {
-  uint32_t sse;
-  const uint32_t sse1 = 0;
-  const uint32x4_t q1 = vld1q_dup_u32(&sse1);
-
-  uint16x8_t q4 = vabdq_u16(q2, q3);  // diff = abs(a[x] - b[x])
-  uint16x4_t d0 = vget_low_u16(q4);
-  uint16x4_t d1 = vget_high_u16(q4);
-
-  uint32x4_t q6 = vmlal_u16(q1, d0, d0);
-  uint32x4_t q7 = vmlal_u16(q1, d1, d1);
-
-  uint32x2_t d4 = vadd_u32(vget_low_u32(q6), vget_high_u32(q6));
-  uint32x2_t d5 = vadd_u32(vget_low_u32(q7), vget_high_u32(q7));
-
-  uint32x2_t d6 = vadd_u32(d4, d5);
-
-  sse = vget_lane_u32(d6, 0);
-  sse += vget_lane_u32(d6, 1);
-
-  return sse;
-}
-
-int64_t aom_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
-  static const uint16_t k01234567[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  const uint16x8_t q0 = vld1q_u16(k01234567);
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int x, y;
-  int addinc;
-  uint16x4_t d0, d1, d2, d3;
-  uint16_t dx;
-  uint16x8_t q2, q3, q4, q5;
-
-  switch (width) {
-    case 4:
-      for (y = 0; y < height; y += 2) {
-        d0 = vld1_u16(a);  // load 4 data
-        a += a_stride;
-        d1 = vld1_u16(a);
-        a += a_stride;
-
-        d2 = vld1_u16(b);
-        b += b_stride;
-        d3 = vld1_u16(b);
-        b += b_stride;
-        q2 = vcombine_u16(d0, d1);  // make a 8 data vector
-        q3 = vcombine_u16(d2, d3);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-      }
-      break;
-    case 8:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u16(a);
-        q3 = vld1q_u16(b);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        a += a_stride;
-        b += b_stride;
-      }
-      break;
-    case 16:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u16(a);
-        q3 = vld1q_u16(b);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 8);
-        q3 = vld1q_u16(b + 8);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        a += a_stride;
-        b += b_stride;
-      }
-      break;
-    case 32:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u16(a);
-        q3 = vld1q_u16(b);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 8);
-        q3 = vld1q_u16(b + 8);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 16);
-        q3 = vld1q_u16(b + 16);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 24);
-        q3 = vld1q_u16(b + 24);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        a += a_stride;
-        b += b_stride;
-      }
-      break;
-    case 64:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u16(a);
-        q3 = vld1q_u16(b);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 8);
-        q3 = vld1q_u16(b + 8);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 16);
-        q3 = vld1q_u16(b + 16);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 24);
-        q3 = vld1q_u16(b + 24);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 32);
-        q3 = vld1q_u16(b + 32);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 40);
-        q3 = vld1q_u16(b + 40);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 48);
-        q3 = vld1q_u16(b + 48);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 56);
-        q3 = vld1q_u16(b + 56);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        a += a_stride;
-        b += b_stride;
-      }
-      break;
-    case 128:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u16(a);
-        q3 = vld1q_u16(b);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 8);
-        q3 = vld1q_u16(b + 8);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 16);
-        q3 = vld1q_u16(b + 16);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 24);
-        q3 = vld1q_u16(b + 24);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 32);
-        q3 = vld1q_u16(b + 32);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 40);
-        q3 = vld1q_u16(b + 40);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 48);
-        q3 = vld1q_u16(b + 48);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 56);
-        q3 = vld1q_u16(b + 56);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 64);
-        q3 = vld1q_u16(b + 64);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 72);
-        q3 = vld1q_u16(b + 72);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 80);
-        q3 = vld1q_u16(b + 80);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 88);
-        q3 = vld1q_u16(b + 88);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 96);
-        q3 = vld1q_u16(b + 96);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 104);
-        q3 = vld1q_u16(b + 104);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 112);
-        q3 = vld1q_u16(b + 112);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-
-        q2 = vld1q_u16(a + 120);
-        q3 = vld1q_u16(b + 120);
-
-        sse += highbd_sse_W8x1_neon(q2, q3);
-        a += a_stride;
-        b += b_stride;
-      }
-      break;
-    default:
-
-      for (y = 0; y < height; y++) {
-        x = width;
-        while (x > 0) {
-          addinc = width - x;
-          q2 = vld1q_u16(a + addinc);
-          q3 = vld1q_u16(b + addinc);
-          if (x < 8) {
-            dx = x;
-            q4 = vld1q_dup_u16(&dx);
-            q5 = vcltq_u16(q0, q4);
-            q2 = vandq_u16(q2, q5);
-            q3 = vandq_u16(q3, q5);
-          }
-          sse += highbd_sse_W8x1_neon(q2, q3);
-          x -= 8;
-        }
-        a += a_stride;
-        b += b_stride;
-      }
-  }
-  return (int64_t)sse;
-}
-#endif
diff --git a/aom_dsp/arm/sse_neon_dotprod.c b/aom_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000..9790497
--- /dev/null
+++ b/aom_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                         uint32x4_t *sse) {
+  uint8x16_t s = vld1q_u8(src);
+  uint8x16_t r = vld1q_u8(ref);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
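+  // The dot product of the absolute difference with itself accumulates the
+  // squared error of four pixels into each 32-bit lane.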
+  *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = vld1_u8(src);
+  uint8x8_t r = vld1_u8(ref);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int width, int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
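+  // Widths with a remainder of 1-4 pixels after the 8-wide blocks handle the
+  // leftover columns as a 4x2 block spanning two rows; other widths use
+  // 8-wide blocks only.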
+  if ((width & 0x07) && ((width & 0x07) < 5)) {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j + 4 < width);
+
+      sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  } else {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j < width);
+
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  }
+  return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+    sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
+    sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
+    sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
+    sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_16x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_8x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_8x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse = vdup_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_u32x2(sse);
+}
+
+int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int width,
+                             int height) {
+  switch (width) {
+    case 4:
+      return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 128:
+      return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    default:
+      return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+                                  height);
+  }
+}
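
For context, every kernel in this file accumulates the same quantity as the generic C path for aom_sse: the sum of squared differences between the source and reference blocks, with vabd (absolute difference) feeding vdot (dot-product accumulate) so that |s - r|^2 is summed lane-wise. Below is a minimal scalar sketch of that arithmetic; the function name is illustrative only and not part of the patch.

// Scalar model of the SSE computation vectorised above (illustration only;
// not the library's aom_sse_c).
static int64_t sse_wxh_scalar(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride, int width,
                              int height) {
  int64_t sse = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const int diff = src[j] - ref[j];
      sse += diff * diff;  // vabd + vdot accumulate |s - r|^2 per lane.
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sse;
}

The width switch in aom_sse_neon_dotprod only selects how many 16-byte columns are processed per row; the accumulated quantity is the same in every branch.
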
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 9599ae0..2e6e738 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -18,6 +18,7 @@
 #include "aom/aom_integer.h"
 
 #include "aom_dsp/variance.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
@@ -154,59 +155,58 @@
     return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
   }
 
-#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
-  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                        \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, unsigned int *sse) {                \
-    if (xoffset == 0) {                                                       \
-      if (yoffset == 0) {                                                     \
-        return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
-                                            sse);                             \
-      } else if (yoffset == 4) {                                              \
-        uint8_t tmp[w * h];                                                   \
-        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);       \
-        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
-      } else {                                                                \
-        uint8_t tmp[w * h];                                                   \
-        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,      \
-                                    yoffset);                                 \
-        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
-      }                                                                       \
-    } else if (xoffset == 4) {                                                \
-      uint8_t tmp0[w * (h + padding)];                                        \
-      if (yoffset == 0) {                                                     \
-        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);               \
-        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
-      } else if (yoffset == 4) {                                              \
-        uint8_t tmp1[w * (h + padding)];                                      \
-        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
-        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
-      } else {                                                                \
-        uint8_t tmp1[w * (h + padding)];                                      \
-        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
-        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
-      }                                                                       \
-    } else {                                                                  \
-      uint8_t tmp0[w * (h + padding)];                                        \
-      if (yoffset == 0) {                                                     \
-        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);    \
-        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
-      } else if (yoffset == 4) {                                              \
-        uint8_t tmp1[w * h];                                                  \
-        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
-                                    xoffset);                                 \
-        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
-      } else {                                                                \
-        uint8_t tmp1[w * h];                                                  \
-        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
-                                    xoffset);                                 \
-        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
-      }                                                                       \
-    }                                                                         \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                       \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
+    if (xoffset == 0) {                                                      \
+      if (yoffset == 0) {                                                    \
+        return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      } else {                                                               \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
+                                    yoffset);                                \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      }                                                                      \
+    } else if (xoffset == 4) {                                               \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    } else {                                                                 \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    }                                                                        \
   }
 
 SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
@@ -279,6 +279,36 @@
   } while (i != 0);
 }
 
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 4.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint8x8_t p = vld1_u8(second_pred);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
 static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                                uint8_t *dst_ptr, int src_stride,
@@ -307,6 +337,35 @@
   } while (--i > 0);
 }
 
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 8.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint8x8_t p = vld1_u8(second_pred);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+  } while (--i > 0);
+}
+
 // Combine bilinear filter with aom_comp_avg_pred for large blocks.
 static void avg_pred_var_filter_block2d_bil_large(
     const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
@@ -342,6 +401,43 @@
   } while (--i != 0);
 }
 
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
 static void avg_pred_var_filter_block2d_bil_w16(
     const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
@@ -378,6 +474,46 @@
                                         filter_offset, second_pred);
 }
 
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+      second_pred, jcp_param);
+}
+
 // Combine averaging subpel filter with aom_comp_avg_pred.
 static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                             uint8_t *dst_ptr, int src_stride,
@@ -409,6 +545,37 @@
   } while (--i != 0);
 }
 
+// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred.
+static void dist_wtd_avg_pred_var_filter_block2d_avg(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t p = vld1q_u8(second_pred);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
 // Implementation of aom_comp_avg_pred for blocks having width >= 16.
 static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                      int dst_width, int dst_height,
@@ -436,6 +603,36 @@
   } while (--i != 0);
 }
 
+// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16.
+static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                              int src_stride, int dst_width, int dst_height,
+                              const uint8_t *second_pred,
+                              const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(src_ptr + j);
+      uint8x16_t p = vld1q_u8(second_pred);
+
+      uint8x16_t avg = dist_wtd_avg_u8x16(s, p, fwd_offset, bck_offset);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
 #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
   unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                  \
       const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
@@ -459,53 +656,53 @@
       uint8_t tmp[w * h];                                                      \
       if (yoffset == 0) {                                                      \
         avg_pred(src, tmp, source_stride, w, h, second_pred);                  \
-        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);     \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
       } else if (yoffset == 4) {                                               \
         avg_pred_var_filter_block2d_avg(src, tmp, source_stride,               \
                                         source_stride, w, h, second_pred);     \
-        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);     \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
       } else {                                                                 \
         avg_pred_var_filter_block2d_bil_w##w(                                  \
             src, tmp, source_stride, source_stride, h, yoffset, second_pred);  \
-        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);     \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
       }                                                                        \
     } else if (xoffset == 4) {                                                 \
       uint8_t tmp0[w * (h + padding)];                                         \
       if (yoffset == 0) {                                                      \
         avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,     \
                                         second_pred);                          \
-        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
       } else if (yoffset == 4) {                                               \
         uint8_t tmp1[w * (h + padding)];                                       \
         var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
         avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       } else {                                                                 \
         uint8_t tmp1[w * (h + padding)];                                       \
         var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
         avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                              second_pred);                     \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       }                                                                        \
     } else {                                                                   \
       uint8_t tmp0[w * (h + padding)];                                         \
       if (yoffset == 0) {                                                      \
         avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,   \
                                              xoffset, second_pred);            \
-        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
       } else if (yoffset == 4) {                                               \
         uint8_t tmp1[w * h];                                                   \
         var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                     (h + padding), xoffset);                   \
         avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       } else {                                                                 \
         uint8_t tmp1[w * h];                                                   \
         var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                     (h + padding), xoffset);                   \
         avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                              second_pred);                     \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       }                                                                        \
     }                                                                          \
   }
@@ -550,6 +747,125 @@
 #undef SUBPEL_AVG_VARIANCE_WXH_NEON
 #undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
 
+#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
+  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon(         \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {  \
+    uint8_t tmp0[w * (h + padding)];                                        \
+    uint8_t tmp1[w * h];                                                    \
+    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+                                xoffset);                                   \
+    dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                          \
+        tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);              \
+    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
+  }
+
+#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)       \
+  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon(            \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
+    if (xoffset == 0) {                                                        \
+      uint8_t tmp[w * h];                                                      \
+      if (yoffset == 0) {                                                      \
+        dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred,          \
+                          jcp_param);                                          \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride,      \
+                                                 source_stride, w, h,          \
+                                                 second_pred, jcp_param);      \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else {                                                                 \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                         \
+            src, tmp, source_stride, source_stride, h, yoffset, second_pred,   \
+            jcp_param);                                                        \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        dist_wtd_avg_pred_var_filter_block2d_avg(                              \
+            src, tmp0, source_stride, 1, w, h, second_pred, jcp_param);        \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h,       \
+                                                 second_pred, jcp_param);      \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                         \
+            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);             \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    } else {                                                                   \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                         \
+            src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param);  \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h,       \
+                                                 second_pred, jcp_param);      \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                         \
+            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);             \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+
 #if !CONFIG_REALTIME_ONLY
 
 #define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
@@ -665,7 +981,7 @@
     var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);                 \
     aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
                             invert_mask);                                      \
-    return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);        \
+    return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);               \
   }
 
 #define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)             \
@@ -679,20 +995,20 @@
       if (yoffset == 0) {                                                      \
         aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
       } else if (yoffset == 4) {                                               \
         uint8_t tmp1[w * h];                                                   \
         var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);       \
         aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       } else {                                                                 \
         uint8_t tmp1[w * h];                                                   \
         var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,      \
                                     yoffset);                                  \
         aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       }                                                                        \
     } else if (xoffset == 4) {                                                 \
       uint8_t tmp0[w * (h + padding)];                                         \
@@ -701,7 +1017,7 @@
         var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);                \
         aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       } else if (yoffset == 4) {                                               \
         uint8_t tmp1[w * h];                                                   \
         uint8_t tmp2[w * h];                                                   \
@@ -709,7 +1025,7 @@
         var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
         aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
       } else {                                                                 \
         uint8_t tmp1[w * h];                                                   \
         uint8_t tmp2[w * h];                                                   \
@@ -717,7 +1033,7 @@
         var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
         aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
       }                                                                        \
     } else {                                                                   \
       if (yoffset == 0) {                                                      \
@@ -726,7 +1042,7 @@
         var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);     \
         aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
       } else if (yoffset == 4) {                                               \
         uint8_t tmp0[w * (h + padding)];                                       \
         uint8_t tmp1[w * h];                                                   \
@@ -736,7 +1052,7 @@
         var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
         aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
       } else {                                                                 \
         uint8_t tmp0[w * (h + padding)];                                       \
         uint8_t tmp1[w * (h + padding)];                                       \
@@ -746,7 +1062,7 @@
         var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
         aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                 msk_stride, invert_mask);                      \
-        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
       }                                                                        \
     }                                                                          \
   }
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index ff68c12..b5a8b97 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_config.h"
 
@@ -62,7 +65,16 @@
 #endif
 }
 
-static INLINE unsigned int horizontal_add_u32x4(const uint32x4_t a) {
+static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_s32(a);
+#else
+  const int64x2_t b = vpaddlq_s32(a);
+  return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
 #if AOM_ARCH_AARCH64
   return vaddvq_u32(a);
 #else
@@ -88,6 +100,21 @@
 #endif
 }
 
+static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+  int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
+  int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
+  return vpaddq_s32(res01, res23);
+#else
+  int32x4_t res = vdupq_n_s32(0);
+  res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
+  res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
+  res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
+  res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
+  return res;
+#endif
+}
+
 static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
                                                  const uint16x8_t vec_hi) {
 #if AOM_ARCH_AARCH64
@@ -186,3 +213,72 @@
   return vget_lane_u32(vreinterpret_u32_u64(c), 0);
 #endif
 }
+
+static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+#if AOM_ARCH_AARCH64
+  return vpaddq_s32(a, b);
+#else
+  const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+  const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+  return vcombine_s32(a0, b0);
+#endif
+}
+
+static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vget_low_s32(vpaddq_s32(a, a));
+#else
+  return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+  return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+  uint64x2_t sum = vpaddlq_u32(a[0]);
+  sum = vpadalq_u32(sum, a[1]);
+  sum = vpadalq_u32(sum, a[2]);
+  sum = vpadalq_u32(sum, a[3]);
+
+  return horizontal_add_u64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+
+  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+  sum[0] = vpadalq_u32(sum[0], a[8]);
+  sum[1] = vpadalq_u32(sum[1], a[9]);
+  sum[0] = vpadalq_u32(sum[0], a[10]);
+  sum[1] = vpadalq_u32(sum[1], a[11]);
+  sum[0] = vpadalq_u32(sum[0], a[12]);
+  sum[1] = vpadalq_u32(sum[1], a[13]);
+  sum[0] = vpadalq_u32(sum[0], a[14]);
+  sum[1] = vpadalq_u32(sum[1], a[15]);
+
+  return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+#endif  // AOM_AOM_DSP_ARM_SUM_NEON_H_
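
The new horizontal_long_add_u32x4_xN helpers reduce several uint32x4_t accumulators to a single 64-bit total, widening with vpaddlq_u32/vpadalq_u32 as they go so the combined sum cannot wrap at 32 bits. Conceptually each returns the plain sum of all input lanes, as in this scalar sketch (illustrative name only):

// Scalar equivalent of horizontal_long_add_u32x4_x4 (illustration only).
static uint64_t long_add_u32x4_x4_scalar(const uint32_t lanes[4][4]) {
  uint64_t sum = 0;
  for (int v = 0; v < 4; v++) {
    for (int l = 0; l < 4; l++) sum += lanes[v][l];  // widen before adding
  }
  return sum;
}
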
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 626cf21..424b2b4 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -287,130 +287,6 @@
   return aom_sum_squares_i16_c(src, n);
 }
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
-                                              int width, int height) {
-  uint64_t sum = 0;
-  uint64_t sse = 0;
-  uint32x2_t sum_u32 = vdup_n_u32(0);
-  uint32x2_t sse_u32 = vdup_n_u32(0);
-
-  int h = height / 2;
-  do {
-    int w = width;
-    uint8_t *src_ptr = src;
-    do {
-      uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
-
-      sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
-
-      sse_u32 = vdot_u32(sse_u32, s0, s0);
-
-      src_ptr += 8;
-      w -= 8;
-    } while (w >= 8);
-
-    // Process remaining columns in the row using C.
-    while (w > 0) {
-      int idx = width - w;
-      const uint8_t v = src[idx];
-      sum += v;
-      sse += v * v;
-      w--;
-    }
-
-    src += 2 * src_stride;
-  } while (--h != 0);
-
-  sum += horizontal_long_add_u32x2(sum_u32);
-  sse += horizontal_long_add_u32x2(sse_u32);
-
-  return sse - sum * sum / (width * height);
-}
-
-static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
-                                              int width, int height) {
-  uint64_t sum = 0;
-  uint64_t sse = 0;
-  uint32x2_t sum_u32 = vdup_n_u32(0);
-  uint32x2_t sse_u32 = vdup_n_u32(0);
-
-  int h = height;
-  do {
-    int w = width;
-    uint8_t *src_ptr = src;
-    do {
-      uint8x8_t s0 = vld1_u8(src_ptr);
-
-      sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
-
-      sse_u32 = vdot_u32(sse_u32, s0, s0);
-
-      src_ptr += 8;
-      w -= 8;
-    } while (w >= 8);
-
-    // Process remaining columns in the row using C.
-    while (w > 0) {
-      int idx = width - w;
-      const uint8_t v = src[idx];
-      sum += v;
-      sse += v * v;
-      w--;
-    }
-
-    src += src_stride;
-  } while (--h != 0);
-
-  sum += horizontal_long_add_u32x2(sum_u32);
-  sse += horizontal_long_add_u32x2(sse_u32);
-
-  return sse - sum * sum / (width * height);
-}
-
-static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
-                                               int width, int height) {
-  uint64_t sum = 0;
-  uint64_t sse = 0;
-  uint32x4_t sum_u32 = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int h = height;
-  do {
-    int w = width;
-    uint8_t *src_ptr = src;
-    do {
-      uint8x16_t s0 = vld1q_u8(src_ptr);
-
-      sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
-
-      sse_u32 = vdotq_u32(sse_u32, s0, s0);
-
-      src_ptr += 16;
-      w -= 16;
-    } while (w >= 16);
-
-    // Process remaining columns in the row using C.
-    while (w > 0) {
-      int idx = width - w;
-      const uint8_t v = src[idx];
-      sum += v;
-      sse += v * v;
-      w--;
-    }
-
-    src += src_stride;
-  } while (--h != 0);
-
-  sum += horizontal_long_add_u32x4(sum_u32);
-  sse += horizontal_long_add_u32x4(sse_u32);
-
-  return sse - sum * sum / (width * height);
-}
-
-#else  //  !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
                                               int width, int height) {
   uint64_t sum = 0;
@@ -584,8 +460,6 @@
   return sse - sum * sum / (width * height);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
                             int height) {
   if (width >= 16) {
diff --git a/aom_dsp/arm/sum_squares_neon_dotprod.c b/aom_dsp/arm/sum_squares_neon_dotprod.c
new file mode 100644
index 0000000..44462a6
--- /dev/null
+++ b/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+                                                      int src_stride, int width,
+                                                      int height) {
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x2_t sum_u32 = vdup_n_u32(0);
+  uint32x2_t sse_u32 = vdup_n_u32(0);
+
+  int h = height / 2;
+  do {
+    int w = width;
+    uint8_t *src_ptr = src;
+    do {
+      uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+      sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+      sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+      src_ptr += 8;
+      w -= 8;
+    } while (w >= 8);
+
+    // Process remaining columns in the row using C.
+    while (w > 0) {
+      int idx = width - w;
+      const uint8_t v = src[idx];
+      sum += v;
+      sse += v * v;
+      w--;
+    }
+
+    src += 2 * src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x2(sum_u32);
+  sse += horizontal_long_add_u32x2(sse_u32);
+
+  return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+                                                      int src_stride, int width,
+                                                      int height) {
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x2_t sum_u32 = vdup_n_u32(0);
+  uint32x2_t sse_u32 = vdup_n_u32(0);
+
+  int h = height;
+  do {
+    int w = width;
+    uint8_t *src_ptr = src;
+    do {
+      uint8x8_t s0 = vld1_u8(src_ptr);
+
+      sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+      sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+      src_ptr += 8;
+      w -= 8;
+    } while (w >= 8);
+
+    // Process remaining columns in the row using C.
+    while (w > 0) {
+      int idx = width - w;
+      const uint8_t v = src[idx];
+      sum += v;
+      sse += v * v;
+      w--;
+    }
+
+    src += src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x2(sum_u32);
+  sse += horizontal_long_add_u32x2(sse_u32);
+
+  return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+                                                       int src_stride,
+                                                       int width, int height) {
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int h = height;
+  do {
+    int w = width;
+    uint8_t *src_ptr = src;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+
+      sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
+
+      sse_u32 = vdotq_u32(sse_u32, s0, s0);
+
+      src_ptr += 16;
+      w -= 16;
+    } while (w >= 16);
+
+    // Process remaining columns in the row using C.
+    while (w > 0) {
+      int idx = width - w;
+      const uint8_t v = src[idx];
+      sum += v;
+      sse += v * v;
+      w--;
+    }
+
+    src += src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x4(sum_u32);
+  sse += horizontal_long_add_u32x4(sse_u32);
+
+  return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width,
+                                    int height) {
+  if (width >= 16) {
+    return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height);
+  }
+  if (width >= 8) {
+    return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height);
+  }
+  if (width >= 4 && height % 2 == 0) {
+    return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height);
+  }
+  return aom_var_2d_u8_c(src, src_stride, width, height);
+}
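
These helpers return the block's sum of squares minus the squared sum divided by the sample count, i.e. the unnormalised variance identity Q - S*S/N with integer (truncating) division. A scalar sketch of the same computation follows; the name is illustrative only.

// Illustrative only: computes the same quantity the helpers above return.
static uint64_t var_2d_u8_scalar(const uint8_t *src, int src_stride, int width,
                                 int height) {
  uint64_t sum = 0, sse = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const uint8_t v = src[j];
      sum += v;
      sse += (uint64_t)v * v;
    }
    src += src_stride;
  }
  // sum^2 / N fits comfortably in 64 bits for 8-bit samples up to 128x128.
  return sse - sum * sum / ((uint64_t)width * height);
}
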
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 8218140..b215f6a 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -13,16 +13,14 @@
 
 #include <arm_neon.h>
 
+#include "aom/aom_integer.h"  // For AOM_FORCE_INLINE.
 #include "config/aom_config.h"
 
-// Swap high and low halves.
-static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
-  return vextq_u16(a, a, 4);
-}
-
-static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
-                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
-                                    uint8x8_t *a6, uint8x8_t *a7) {
+static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
+                                                  uint8x8_t *a2, uint8x8_t *a3,
+                                                  uint8x8_t *a4, uint8x8_t *a5,
+                                                  uint8x8_t *a6,
+                                                  uint8x8_t *a7) {
   // Swap 8 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
@@ -74,8 +72,9 @@
   *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
 
-static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
-                                    uint8x8_t *a3) {
+static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
+                                                  uint8x8_t *a2,
+                                                  uint8x8_t *a3) {
   // Swap 8 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
@@ -107,7 +106,8 @@
   *a3 = vreinterpret_u8_u16(c1.val[1]);
 }
 
-static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
+                                                  uint8x8_t *a1) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03  10 11 12 13
   // a1: 20 21 22 23  30 31 32 33
@@ -136,10 +136,12 @@
   *a1 = d0.val[1];
 }
 
-static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
-                                    uint8x8_t *a3, const uint8x8_t a4,
-                                    const uint8x8_t a5, const uint8x8_t a6,
-                                    const uint8x8_t a7) {
+static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
+                                          uint8x8_t a2, uint8x8_t a3,
+                                          uint8x8_t a4, uint8x8_t a5,
+                                          uint8x8_t a6, uint8x8_t a7,
+                                          uint8x8_t *o0, uint8x8_t *o1,
+                                          uint8x8_t *o2, uint8x8_t *o3) {
   // Swap 32 bit elements. Goes from:
   // a0: 00 01 02 03 XX XX XX XX
   // a1: 10 11 12 13 XX XX XX XX
@@ -156,13 +158,13 @@
   // b3.val[0]: 30 31 32 33 70 71 72 73
 
   const uint32x2x2_t b0 =
-      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
   const uint32x2x2_t b1 =
-      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
   const uint32x2x2_t b2 =
-      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
   const uint32x2x2_t b3 =
-      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
 
   // Swap 16 bit elements resulting in:
   // c0.val[0]: 00 01 20 21 40 41 60 61
@@ -186,23 +188,19 @@
   const uint8x8x2_t d1 =
       vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
 
-  *a0 = d0.val[0];
-  *a1 = d0.val[1];
-  *a2 = d1.val[0];
-  *a3 = d1.val[1];
+  *o0 = d0.val[0];
+  *o1 = d0.val[1];
+  *o2 = d1.val[0];
+  *o3 = d1.val[1];
 }
 
-// Input:
-// 00 01 02 03
-// 10 11 12 13
-// 20 21 22 23
-// 30 31 32 33
-// Output:
-// 00 10 20 30
-// 01 11 21 31
-// 02 12 22 32
-// 03 13 23 33
-static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
+static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
+  // Input:
+  // 00 01 02 03
+  // 10 11 12 13
+  // 20 21 22 23
+  // 30 31 32 33
+
   // b:
   // 00 10 02 12
   // 01 11 03 13
@@ -221,23 +219,25 @@
   // 03 13 23 33
   const uint32x2x2_t e =
       vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+
+  // Output:
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
   a[0] = vreinterpret_u16_u32(d.val[0]);
   a[1] = vreinterpret_u16_u32(e.val[0]);
   a[2] = vreinterpret_u16_u32(d.val[1]);
   a[3] = vreinterpret_u16_u32(e.val[1]);
 }
 
-// 4x8 Input:
-// a[0]: 00 01 02 03 04 05 06 07
-// a[1]: 10 11 12 13 14 15 16 17
-// a[2]: 20 21 22 23 24 25 26 27
-// a[3]: 30 31 32 33 34 35 36 37
-// 8x4 Output:
-// a[0]: 00 10 20 30 04 14 24 34
-// a[1]: 01 11 21 31 05 15 25 35
-// a[2]: 02 12 22 32 06 16 26 36
-// a[3]: 03 13 23 33 07 17 27 37
-static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
+static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
+  // 4x8 Input:
+  // a[0]: 00 01 02 03 04 05 06 07
+  // a[1]: 10 11 12 13 14 15 16 17
+  // a[2]: 20 21 22 23 24 25 26 27
+  // a[3]: 30 31 32 33 34 35 36 37
+
   // b0.val[0]: 00 10 02 12 04 14 06 16
   // b0.val[1]: 01 11 03 13 05 15 07 17
   // b1.val[0]: 20 30 22 32 24 34 26 36
@@ -254,6 +254,11 @@
   const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                     vreinterpretq_u32_u16(b1.val[1]));
 
+  // 8x4 Output:
+  // a[0]: 00 10 20 30 04 14 24 34
+  // a[1]: 01 11 21 31 05 15 25 35
+  // a[2]: 02 12 22 32 06 16 26 36
+  // a[3]: 03 13 23 33 07 17 27 37
   a[0] = vreinterpretq_u16_u32(c0.val[0]);
   a[1] = vreinterpretq_u16_u32(c1.val[0]);
   a[2] = vreinterpretq_u16_u32(c0.val[1]);
@@ -345,12 +350,11 @@
   a[3] = d0.val[0];  // p3q3
 }
 
-static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
-                                     uint16x4_t *a2, uint16x4_t *a3,
-                                     uint16x4_t *a4, uint16x4_t *a5,
-                                     uint16x4_t *a6, uint16x4_t *a7,
-                                     uint16x8_t *o0, uint16x8_t *o1,
-                                     uint16x8_t *o2, uint16x8_t *o3) {
+static INLINE void transpose_elems_u16_4x8(
+    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
+    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
+    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
+    uint16x8_t *o2, uint16x8_t *o3) {
   // Combine rows. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
@@ -366,10 +370,10 @@
   // b2: 20 21 22 23 60 61 62 63
   // b3: 30 31 32 33 70 71 72 73
 
-  const uint16x8_t b0 = vcombine_u16(*a0, *a4);
-  const uint16x8_t b1 = vcombine_u16(*a1, *a5);
-  const uint16x8_t b2 = vcombine_u16(*a2, *a6);
-  const uint16x8_t b3 = vcombine_u16(*a3, *a7);
+  const uint16x8_t b0 = vcombine_u16(a0, a4);
+  const uint16x8_t b1 = vcombine_u16(a1, a5);
+  const uint16x8_t b2 = vcombine_u16(a2, a6);
+  const uint16x8_t b3 = vcombine_u16(a3, a7);
 
   // Swap 16 bit elements resulting in:
   // c0.val[0]: 00 10 02 12 40 50 42 52
@@ -397,12 +401,11 @@
   *o3 = vreinterpretq_u16_u32(d1.val[1]);
 }
 
-static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
-                                     int16x4_t *a2, int16x4_t *a3,
-                                     int16x4_t *a4, int16x4_t *a5,
-                                     int16x4_t *a6, int16x4_t *a7,
-                                     int16x8_t *o0, int16x8_t *o1,
-                                     int16x8_t *o2, int16x8_t *o3) {
+static INLINE void transpose_elems_s16_4x8(
+    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
+    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
+    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
+    int16x8_t *o2, int16x8_t *o3) {
   // Combine rows. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
@@ -418,10 +421,10 @@
   // b2: 20 21 22 23 60 61 62 63
   // b3: 30 31 32 33 70 71 72 73
 
-  const int16x8_t b0 = vcombine_s16(*a0, *a4);
-  const int16x8_t b1 = vcombine_s16(*a1, *a5);
-  const int16x8_t b2 = vcombine_s16(*a2, *a6);
-  const int16x8_t b3 = vcombine_s16(*a3, *a7);
+  const int16x8_t b0 = vcombine_s16(a0, a4);
+  const int16x8_t b1 = vcombine_s16(a1, a5);
+  const int16x8_t b2 = vcombine_s16(a2, a6);
+  const int16x8_t b3 = vcombine_s16(a3, a7);
 
   // Swap 16 bit elements resulting in:
   // c0.val[0]: 00 10 02 12 40 50 42 52
@@ -449,10 +452,9 @@
   *o3 = vreinterpretq_s16_s32(d1.val[1]);
 }
 
-static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
-                                     uint16x8_t *a2, uint16x8_t *a3,
-                                     uint16x8_t *a4, uint16x8_t *a5,
-                                     uint16x8_t *a6, uint16x8_t *a7) {
+static INLINE void transpose_elems_inplace_u16_8x8(
+    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
+    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
@@ -537,10 +539,11 @@
   return b0;
 }
 
-static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
-                                     int16x8_t *a2, int16x8_t *a3,
-                                     int16x8_t *a4, int16x8_t *a5,
-                                     int16x8_t *a6, int16x8_t *a7) {
+static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2, int16x8_t *a3,
+                                                   int16x8_t *a4, int16x8_t *a5,
+                                                   int16x8_t *a6,
+                                                   int16x8_t *a7) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
@@ -609,7 +612,8 @@
   *a7 = d3.val[1];
 }
 
-static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
+                                            int16x8_t *out) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
@@ -678,8 +682,10 @@
   out[7] = d3.val[1];
 }
 
-static INLINE void transpose_u16_4x4d(uint16x4_t *a0, uint16x4_t *a1,
-                                      uint16x4_t *a2, uint16x4_t *a3) {
+static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
+                                                   uint16x4_t *a1,
+                                                   uint16x4_t *a2,
+                                                   uint16x4_t *a3) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
@@ -711,8 +717,9 @@
   *a3 = vreinterpret_u16_u32(c1.val[1]);
 }
 
-static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
-                                      int16x4_t *a2, int16x4_t *a3) {
+static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
+                                                   int16x4_t *a2,
+                                                   int16x4_t *a3) {
   // Swap 16 bit elements. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
@@ -758,8 +765,12 @@
   return b0;
 }
 
-static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
-                                     int32x4_t *a2, int32x4_t *a3) {
+static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
+                                           const int32x4_t a1,
+                                           const int32x4_t a2,
+                                           const int32x4_t a3, int32x4_t *o0,
+                                           int32x4_t *o1, int32x4_t *o2,
+                                           int32x4_t *o3) {
   // Swap 32 bit elements. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
@@ -771,8 +782,8 @@
   // b1.val[0]: 20 30 22 32
   // b1.val[1]: 21 31 23 33
 
-  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
-  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
+  const int32x4x2_t b1 = vtrnq_s32(a2, a3);
 
   // Swap 64 bit elements resulting in:
   // c0.val[0]: 00 10 20 30
@@ -783,10 +794,267 @@
   const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
   const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
 
-  *a0 = c0.val[0];
-  *a1 = c1.val[0];
-  *a2 = c0.val[1];
-  *a3 = c1.val[1];
+  *o0 = c0.val[0];
+  *o1 = c1.val[0];
+  *o2 = c0.val[1];
+  *o3 = c1.val[1];
+}
+
+static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                                   int32x4_t *a2,
+                                                   int32x4_t *a3) {
+  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
+}
+
+static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
+                                            int32x4_t *out) {
+  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
+                          &out[3]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
+                                                        int32x4_t *out,
+                                                        const int width,
+                                                        const int height) {
+  const int h = height >> 2;
+  const int w = width >> 2;
+  for (int j = 0; j < w; j++) {
+    for (int i = 0; i < h; i++) {
+      transpose_arrays_s32_4x4(in + j * height + i * 4,
+                               out + i * width + j * 4);
+    }
+  }
+}
+
+#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
+  static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
+      const int32x4_t *in, int32x4_t *out) {                   \
+    transpose_arrays_s32_4nx4n(in, out, w, h);                 \
+  }
+
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
+
+#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
+
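+// A scalar sketch of the layout consumed by transpose_arrays_s32_4nx4n()
+// above (illustrative only, not used by the library code): a matrix with
+// `height` rows and `width` columns is stored as 4-column strips, each strip
+// row by row, so element (r, c) lives in lane (c & 3) of vector
+// (c >> 2) * height + r. The output uses the same convention with the
+// dimensions swapped.
+static INLINE int32_t get_elem_s32_strips(const int32x4_t *vecs, int height,
+                                          int r, int c) {
+  int32_t lanes[4];
+  vst1q_s32(lanes, vecs[(c >> 2) * height + r]);
+  return lanes[c & 3];
+}
+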
+static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+  return vtrn1q_s64(a, b);
+#else
+  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
+#endif
+}
+
+static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+  return vtrn2q_s64(a, b);
+#else
+  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
+#endif
+}
+
+static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
+                                           int32x4_t a2, int32x4_t a3,
+                                           int32x4_t a4, int32x4_t a5,
+                                           int32x4_t a6, int32x4_t a7,
+                                           int32x4x2_t *o0, int32x4x2_t *o1,
+                                           int32x4x2_t *o2, int32x4x2_t *o3) {
+  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
+  // matrix transpose implementation:
+  // [ A ]^T => [ A^T B^T ]
+  // [ B ]
+
+  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
+  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T
+
+  o0->val[0] = a0;
+  o1->val[0] = a1;
+  o2->val[0] = a2;
+  o3->val[0] = a3;
+
+  o0->val[1] = a4;
+  o1->val[1] = a5;
+  o2->val[1] = a6;
+  o3->val[1] = a7;
+}
+
+static INLINE void transpose_elems_inplace_s32_8x8(
+    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
+    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
+  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
+  // matrix transpose implementation:
+  // [ A B ]^T => [ A^T C^T ]
+  // [ C D ]      [ B^T D^T ]
+
+  int32x4_t q0_v1 = a0->val[0];
+  int32x4_t q0_v2 = a1->val[0];
+  int32x4_t q0_v3 = a2->val[0];
+  int32x4_t q0_v4 = a3->val[0];
+
+  int32x4_t q1_v1 = a0->val[1];
+  int32x4_t q1_v2 = a1->val[1];
+  int32x4_t q1_v3 = a2->val[1];
+  int32x4_t q1_v4 = a3->val[1];
+
+  int32x4_t q2_v1 = a4->val[0];
+  int32x4_t q2_v2 = a5->val[0];
+  int32x4_t q2_v3 = a6->val[0];
+  int32x4_t q2_v4 = a7->val[0];
+
+  int32x4_t q3_v1 = a4->val[1];
+  int32x4_t q3_v2 = a5->val[1];
+  int32x4_t q3_v3 = a6->val[1];
+  int32x4_t q3_v4 = a7->val[1];
+
+  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
+  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
+  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
+  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T
+
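+  // Reassemble with the off-diagonal blocks swapped: C^T lands in the
+  // top-right half (val[1] of a0..a3) and B^T in the bottom-left half
+  // (val[0] of a4..a7).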
+  a0->val[0] = q0_v1;
+  a1->val[0] = q0_v2;
+  a2->val[0] = q0_v3;
+  a3->val[0] = q0_v4;
+
+  a0->val[1] = q2_v1;
+  a1->val[1] = q2_v2;
+  a2->val[1] = q2_v3;
+  a3->val[1] = q2_v4;
+
+  a4->val[0] = q1_v1;
+  a5->val[0] = q1_v2;
+  a6->val[0] = q1_v3;
+  a7->val[0] = q1_v4;
+
+  a4->val[1] = q3_v1;
+  a5->val[1] = q3_v2;
+  a6->val[1] = q3_v3;
+  a7->val[1] = q3_v4;
+}
+
+static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
+                                            int16x4_t *const out) {
+  int16x4_t a0 = in[0];
+  int16x4_t a1 = in[1];
+  int16x4_t a2 = in[2];
+  int16x4_t a3 = in[3];
+
+  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+  out[0] = a0;
+  out[1] = a1;
+  out[2] = a2;
+  out[3] = a3;
+}
+
+static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
+                                            int16x8_t *const out) {
+#if AOM_ARCH_AARCH64
+  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
+                                  vcombine_s16(in[1], vdup_n_s16(0)));
+  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
+                                  vcombine_s16(in[3], vdup_n_s16(0)));
+  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
+                                  vcombine_s16(in[5], vdup_n_s16(0)));
+  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
+                                  vcombine_s16(in[7], vdup_n_s16(0)));
+#else
+  int16x4x2_t temp;
+  temp = vzip_s16(in[0], in[1]);
+  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(in[2], in[3]);
+  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(in[4], in[5]);
+  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(in[6], in[7]);
+  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+  const int32x4x2_t b02 =
+      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+  const int32x4x2_t b13 =
+      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if AOM_ARCH_AARCH64
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+#else
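+  // Emulate vzip1q_s64/vzip2q_s64: for vectors a and b,
+  // vextq_s32(vextq_s32(a, a, 2), b, 2) yields { a0, a1, b0, b1 } and
+  // vextq_s32(a, vextq_s32(b, b, 2), 2) yields { a2, a3, b2, b3 }.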
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+  out[1] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+                                            int16x4_t *const out) {
+  // Swap 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+
+  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+                                    vreinterpretq_u32_s16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+                                    vreinterpretq_u32_s16(b1.val[1]));
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+
+  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
 }
 
 #endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 5e33996..9e4e8c0 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -11,153 +11,12 @@
 
 #include <arm_neon.h>
 
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
+#include "aom/aom_integer.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
-#include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
-
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
-                                     const uint8_t *ref, int ref_stride, int h,
-                                     uint32_t *sse, int *sum) {
-  uint32x4_t src_sum = vdupq_n_u32(0);
-  uint32x4_t ref_sum = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint8x16_t s = load_unaligned_u8q(src, src_stride);
-    uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
-
-    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
-    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
-    uint8x16_t abs_diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-    src += 4 * src_stride;
-    ref += 4 * ref_stride;
-    i -= 4;
-  } while (i != 0);
-
-  int32x4_t sum_diff =
-      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
-  *sum = horizontal_add_s32x4(sum_diff);
-  *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
-                                     const uint8_t *ref, int ref_stride, int h,
-                                     uint32_t *sse, int *sum) {
-  uint32x4_t src_sum = vdupq_n_u32(0);
-  uint32x4_t ref_sum = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
-    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
-
-    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
-    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
-    uint8x16_t abs_diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  int32x4_t sum_diff =
-      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
-  *sum = horizontal_add_s32x4(sum_diff);
-  *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride, int h,
-                                      uint32_t *sse, int *sum) {
-  uint32x4_t src_sum = vdupq_n_u32(0);
-  uint32x4_t ref_sum = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint8x16_t s = vld1q_u8(src);
-    uint8x16_t r = vld1q_u8(ref);
-
-    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
-    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
-    uint8x16_t abs_diff = vabdq_u8(s, r);
-    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-    src += src_stride;
-    ref += ref_stride;
-  } while (--i != 0);
-
-  int32x4_t sum_diff =
-      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
-  *sum = horizontal_add_s32x4(sum_diff);
-  *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride,
-                                       int w, int h, uint32_t *sse, int *sum) {
-  uint32x4_t src_sum = vdupq_n_u32(0);
-  uint32x4_t ref_sum = vdupq_n_u32(0);
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s = vld1q_u8(src + j);
-      uint8x16_t r = vld1q_u8(ref + j);
-
-      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
-      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
-
-      uint8x16_t abs_diff = vabdq_u8(s, r);
-      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-      j += 16;
-    } while (j < w);
-
-    src += src_stride;
-    ref += ref_stride;
-  } while (--i != 0);
-
-  int32x4_t sum_diff =
-      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
-  *sum = horizontal_add_s32x4(sum_diff);
-  *sse = horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride, int h,
-                                      uint32_t *sse, int *sum) {
-  variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
-}
-
-static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride, int h,
-                                      uint32_t *sse, int *sum) {
-  variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
-}
-
-static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride,
-                                       int h, uint32_t *sse, int *sum) {
-  variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse, sum);
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride, int h,
@@ -333,8 +192,6 @@
   variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 #define VARIANCE_WXH_NEON(w, h, shift)                                        \
   unsigned int aom_variance##w##x##h##_neon(                                  \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -382,7 +239,7 @@
                                        uint32_t *sse8x8, int *sum8x8,
                                        unsigned int *tot_sse, int *tot_sum,
                                        uint32_t *var8x8) {
-  // Loop over 4 8x8 blocks. Process one 8x32 block.
+  // Loop over four 8x8 blocks. Process one 8x32 block.
   for (int k = 0; k < 4; k++) {
     variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8,
                       &sse8x8[k], &sum8x8[k]);
@@ -390,8 +247,9 @@
 
   *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
   *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
-  for (int i = 0; i < 4; i++)
+  for (int i = 0; i < 4; i++) {
     var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+  }
 }
 
 void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
@@ -400,7 +258,7 @@
                                          unsigned int *tot_sse, int *tot_sum,
                                          uint32_t *var16x16) {
   int sum16x16[2] = { 0 };
-  // Loop over 2 16x16 blocks. Process one 16x32 block.
+  // Loop over two 16x16 blocks. Process one 16x32 block.
   for (int k = 0; k < 2; k++) {
     variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride,
                        16, &sse16x16[k], &sum16x16[k]);
@@ -408,65 +266,12 @@
 
   *tot_sse += sse16x16[0] + sse16x16[1];
   *tot_sum += sum16x16[0] + sum16x16[1];
-  for (int i = 0; i < 2; i++)
+  for (int i = 0; i < 2; i++) {
     var16x16[i] =
         sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+  }
 }
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride,
-                                       unsigned int *sse, int h) {
-  uint32x4_t sse_u32 = vdupq_n_u32(0);
-
-  int i = h;
-  do {
-    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
-    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
-
-    uint8x16_t abs_diff = vabdq_u8(s, r);
-
-    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
-
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  *sse = horizontal_add_u32x4(sse_u32);
-  return horizontal_add_u32x4(sse_u32);
-}
-
-static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
-                                        const uint8_t *ref, int ref_stride,
-                                        unsigned int *sse, int h) {
-  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    uint8x16_t s0 = vld1q_u8(src);
-    uint8x16_t s1 = vld1q_u8(src + src_stride);
-    uint8x16_t r0 = vld1q_u8(ref);
-    uint8x16_t r1 = vld1q_u8(ref + ref_stride);
-
-    uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
-    uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
-
-    sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
-    sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
-
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-    i -= 2;
-  } while (i != 0);
-
-  *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
-  return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse, int h) {
@@ -564,8 +369,6 @@
   return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 #define MSE_WXH_NEON(w, h)                                                 \
   unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
@@ -581,98 +384,87 @@
 
 #undef MSE_WXH_NEON
 
-#define COMPUTE_MSE_16BIT(src_16x8, dst_16x8)                           \
-  /* r7 r6 r5 r4 r3 r2 r1 r0 - 16 bit */                                \
-  const uint16x8_t diff = vabdq_u16(src_16x8, dst_16x8);                \
-  /*r3 r2 r1 r0 - 16 bit */                                             \
-  const uint16x4_t res0_low_16x4 = vget_low_u16(diff);                  \
-  /*r7 r6 r5 r4 - 16 bit */                                             \
-  const uint16x4_t res0_high_16x4 = vget_high_u16(diff);                \
-  /* (r3*r3)= b3 (r2*r2)= b2 (r1*r1)= b1 (r0*r0)= b0 - 32 bit */        \
-  const uint32x4_t res0_32x4 = vmull_u16(res0_low_16x4, res0_low_16x4); \
-  /* (r7*r7)= b7 (r6*r6)= b6 (r5*r5)= b5 (r4*r4)= b4 - 32 bit*/         \
-  /* b3+b7 b2+b6 b1+b5 b0+b4 - 32 bit*/                                 \
-  const uint32x4_t res_32x4 =                                           \
-      vmlal_u16(res0_32x4, res0_high_16x4, res0_high_16x4);             \
-                                                                        \
-  /*a1 a0 - 64 bit*/                                                    \
-  const uint64x2_t vl = vpaddlq_u32(res_32x4);                          \
-  /*a1+a2= f1 a3+a0= f0*/                                               \
-  square_result = vaddq_u64(square_result, vl);
+static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
+                                                   uint16x8_t s0, uint16x8_t s1,
+                                                   uint8x8_t d0, uint8x8_t d1) {
+  int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
+  int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
 
-static AOM_INLINE uint64_t mse_4xh_16bit_neon(uint8_t *dst, int dstride,
-                                              uint16_t *src, int sstride,
-                                              int h) {
-  uint64x2_t square_result = vdupq_n_u64(0);
-  uint32_t d0, d1;
-  int i = h;
-  uint8_t *dst_ptr = dst;
-  uint16_t *src_ptr = src;
-  do {
-    // d03 d02 d01 d00 - 8 bit
-    memcpy(&d0, dst_ptr, 4);
-    dst_ptr += dstride;
-    // d13 d12 d11 d10 - 8 bit
-    memcpy(&d1, dst_ptr, 4);
-    dst_ptr += dstride;
-    // duplication
-    uint8x8_t tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d0));
-    // d03 d02 d01 d00 - 16 bit
-    const uint16x4_t dst0_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
-    // duplication
-    tmp0_8x8 = vreinterpret_u8_u32(vdup_n_u32(d1));
-    // d13 d12 d11 d10 - 16 bit
-    const uint16x4_t dst1_16x4 = vget_low_u16(vmovl_u8(tmp0_8x8));
-    // d13 d12 d11 d10 d03 d02 d01 d00 - 16 bit
-    const uint16x8_t dst_16x8 = vcombine_u16(dst0_16x4, dst1_16x4);
+  int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0));
+  mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0));
+  mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1));
+  mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1));
 
-    // b1r0 - s03 s02 s01 s00 - 16 bit
-    const uint16x4_t src0_16x4 = vld1_u16(src_ptr);
-    src_ptr += sstride;
-    // b1r1 - s13 s12 s11 s10 - 16 bit
-    const uint16x4_t src1_16x4 = vld1_u16(src_ptr);
-    src_ptr += sstride;
-    // s13 s12 s11 s10 s03 s02 s01 s00 - 16 bit
-    const uint16x8_t src_16x8 = vcombine_u16(src0_16x4, src1_16x4);
-
-    COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
-    i -= 2;
-  } while (i != 0);
-  uint64x1_t sum =
-      vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
-  return vget_lane_u64(sum, 0);
+  return vpadalq_u32(sum, vreinterpretq_u32_s32(mse));
 }
 
-static AOM_INLINE uint64_t mse_8xh_16bit_neon(uint8_t *dst, int dstride,
-                                              uint16_t *src, int sstride,
-                                              int h) {
-  uint64x2_t square_result = vdupq_n_u64(0);
-  int i = h;
-  do {
-    // d7 d6 d5 d4 d3 d2 d1 d0 - 8 bit
-    const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(dst));
-    // s7 s6 s5 s4 s3 s2 s1 s0 - 16 bit
-    const uint16x8_t src_16x8 = vld1q_u16(src);
+static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
 
-    COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+  uint64x2_t sum = vdupq_n_u64(0);
 
-    dst += dstride;
-    src += sstride;
-  } while (--i != 0);
-  uint64x1_t sum =
-      vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
-  return vget_lane_u64(sum, 0);
+  if (w == 8) {
+    do {
+      uint8x8_t d0 = vld1_u8(dst + 0 * dstride);
+      uint8x8_t d1 = vld1_u8(dst + 1 * dstride);
+      uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+      uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+      sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+      dst += 2 * dstride;
+      src += 2 * sstride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    do {
+      uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride);
+      uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      h -= 4;
+    } while (h != 0);
+  }
+
+  return sum;
 }
 
 // Computes mse for a given block size. This function gets called for specific
 // block sizes, which are 8x8, 8x4, 4x8 and 4x4.
 uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
                                 int sstride, int w, int h) {
-  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
-         "w=8/4 and h=8/4 must satisfy");
-  switch (w) {
-    case 4: return mse_4xh_16bit_neon(dst, dstride, src, sstride, h);
-    case 8: return mse_8xh_16bit_neon(dst, dstride, src, sstride, h);
-    default: assert(0 && "unsupported width"); return -1;
+  return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
+}
+
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+  int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  for (int i = 0; i < 256; i = i + 8) {
+    int16x8_t a_s16 = vld1q_s16(a + i);
+
+    sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
+    sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
   }
+
+  return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
+}
+
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                 int w, int h) {
+  uint64x2_t sum = vdupq_n_u64(0);
+
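+  // src holds 16 / w blocks of w * h values stored back-to-back (hence no
+  // source stride parameter), while dst is a single 16-wide region with
+  // stride dstride.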
+  int num_blks = 16 / w;
+  do {
+    sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+    dst += w;
+    src += w * h;
+  } while (--num_blks != 0);
+
+  return horizontal_add_u64x2(sum);
 }
diff --git a/aom_dsp/arm/variance_neon_dotprod.c b/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000..9fb52e1
--- /dev/null
+++ b/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = load_unaligned_u8q(src, src_stride);
+    uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
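+    // Dotting with an all-ones vector sums each group of four bytes into the
+    // corresponding 32-bit lane, giving running sums of the source and
+    // reference pixels.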
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 4 * src_stride;
+    ref += 4 * ref_stride;
+    i -= 4;
+  } while (i != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = horizontal_add_s32x4(sum_diff);
+  *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = horizontal_add_s32x4(sum_diff);
+  *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vld1q_u8(src);
+    uint8x16_t r = vld1q_u8(ref);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = horizontal_add_s32x4(sum_diff);
+  *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_large_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride, int w, int h,
+                                               uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(src + j);
+      uint8x16_t r = vld1q_u8(ref + j);
+
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      j += 16;
+    } while (j < w);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = horizontal_add_s32x4(sum_diff);
+  *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+                              sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+                              sum);
+}
+
+static INLINE void variance_128xh_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride, int h,
+                                               uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse,
+                              sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift)                                \
+  unsigned int aom_variance##w##x##h##_neon_dotprod(                          \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse,   \
+                                  &sum);                                      \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                  \
+  }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+VARIANCE_WXH_NEON_DOTPROD(4, 16, 6)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+VARIANCE_WXH_NEON_DOTPROD(8, 32, 8)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 4, 6)
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+VARIANCE_WXH_NEON_DOTPROD(16, 64, 10)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 8, 8)
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 16, 10)
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+VARIANCE_WXH_NEON_DOTPROD(64, 128, 13)
+
+VARIANCE_WXH_NEON_DOTPROD(128, 64, 13)
+VARIANCE_WXH_NEON_DOTPROD(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
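+
+// In the macro above, shift is log2(w * h), so each expansion returns
+// SSE - (sum * sum) / (w * h), i.e. the sum of squared src/ref differences
+// with the contribution of the mean difference removed; e.g. 16x16 blocks
+// give 16 * 16 = 256 = 2^8 and shift = 8.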
+
+void aom_get_var_sse_sum_8x8_quad_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum,
+    uint32_t *var8x8) {
+  // Loop over four 8x8 blocks. Process one 8x32 block.
+  for (int k = 0; k < 4; k++) {
+    variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8),
+                              ref_stride, 8, &sse8x8[k], &sum8x8[k]);
+  }
+
+  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+  for (int i = 0; i < 4; i++) {
+    var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+  }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum,
+    uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+  // Loop over two 16x16 blocks. Process one 16x32 block.
+  for (int k = 0; k < 2; k++) {
+    variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16),
+                               ref_stride, 16, &sse16x16[k], &sum16x16[k]);
+  }
+
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+  for (int i = 0; i < 2; i++) {
+    var16x16[i] =
+        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+  }
+}
+
+static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride,
+                                               unsigned int *sse, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *ref,
+                                                int ref_stride,
+                                                unsigned int *sse, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint8x16_t s0 = vld1q_u8(src);
+    uint8x16_t s1 = vld1q_u8(src + src_stride);
+    uint8x16_t r0 = vld1q_u8(ref);
+    uint8x16_t r1 = vld1q_u8(ref + ref_stride);
+
+    uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+    uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+    sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+    sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+#define MSE_WXH_NEON_DOTPROD(w, h)                                            \
+  unsigned int aom_mse##w##x##h##_neon_dotprod(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \
+  }
+
+MSE_WXH_NEON_DOTPROD(8, 8)
+MSE_WXH_NEON_DOTPROD(8, 16)
+
+MSE_WXH_NEON_DOTPROD(16, 8)
+MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef MSE_WXH_NEON_DOTPROD
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 7b36bf3..893f9c2 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -504,14 +504,14 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
 int aom_satd_c(const tran_low_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i) satd += abs(coeff[i]);
 
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
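+  // (524287 < 2^19 and 524287 * 1024 = 536869888 < 2^29 = 536870912, so the
+  // signed total needs at most 30 bits and fits in a 32-bit int.)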
   return satd;
 }
 
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 467e47b..d26f027 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -13,7 +13,7 @@
 #define AOM_AOM_DSP_ENTENC_H_
 #include <stddef.h>
 #include "aom_dsp/entcode.h"
-#include "aom_ports/bitops.h"
+#include "aom_util/endian_inl.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -87,13 +87,14 @@
   } while (carry);
 }
 
-// Reverse byte order and write data to buffer adding the carry-bit
+// Convert to big-endian byte order and write data to the buffer, adding the
+// carry bit.
 static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
                                                  uint32_t offs, uint64_t output,
                                                  uint64_t carry,
                                                  uint32_t *enc_offs,
                                                  uint8_t num_bytes_ready) {
-  const uint64_t reg = get_byteswap64(output) >> ((8 - num_bytes_ready) << 3);
+  const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3));
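+  // On little-endian hosts this matches the previous
+  // get_byteswap64(output) >> ((8 - num_bytes_ready) << 3) form: both keep
+  // the low num_bytes_ready bytes of output, byte-reversed, in the low bytes
+  // of reg, so the memcpy() below emits the most-significant ready byte
+  // first.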
   memcpy(&out[offs], &reg, 8);
   // Propagate carry backwards if exists
   if (carry) {
diff --git a/aom_dsp/fft.c b/aom_dsp/fft.c
index cad4a65..a44dbf7 100644
--- a/aom_dsp/fft.c
+++ b/aom_dsp/fft.c
@@ -11,6 +11,7 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/fft_common.h"
+#include "config/aom_dsp_rtcd.h"
 
 static INLINE void simple_transpose(const float *A, float *B, int n) {
   for (int y = 0; y < n; y++) {
diff --git a/aom_dsp/fft_common.h b/aom_dsp/fft_common.h
index 5137331..3de1a04 100644
--- a/aom_dsp/fft_common.h
+++ b/aom_dsp/fft_common.h
@@ -47,10 +47,16 @@
 
 // Declare some of the forward non-vectorized transforms which are used in some
 // of the vectorized implementations
+void aom_fft1d_2_float(const float *input, float *output, int stride);
 void aom_fft1d_4_float(const float *input, float *output, int stride);
 void aom_fft1d_8_float(const float *input, float *output, int stride);
 void aom_fft1d_16_float(const float *input, float *output, int stride);
 void aom_fft1d_32_float(const float *input, float *output, int stride);
+void aom_ifft1d_2_float(const float *input, float *output, int stride);
+void aom_ifft1d_4_float(const float *input, float *output, int stride);
+void aom_ifft1d_8_float(const float *input, float *output, int stride);
+void aom_ifft1d_16_float(const float *input, float *output, int stride);
+void aom_ifft1d_32_float(const float *input, float *output, int stride);
 
 /*!\brief Function pointer for transposing a matrix of floats.
  *
diff --git a/aom_dsp/flow_estimation/arm/disflow_neon.c b/aom_dsp/flow_estimation/arm/disflow_neon.c
new file mode 100644
index 0000000..e2ba0e0
--- /dev/null
+++ b/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void get_cubic_kernel_dbl(double x, double *kernel) {
+  assert(0 <= x && x < 1);
+  double x2 = x * x;
+  double x3 = x2 * x;
+  kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+  kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+  kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+  kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
+
+static INLINE void get_cubic_kernel_int(double x, int *kernel) {
+  double kernel_dbl[4];
+  get_cubic_kernel_dbl(x, kernel_dbl);
+
+  kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
+  kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
+  kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
+  kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
+}
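+
+// For reference, a worked value of the kernel above: at x = 0.5 the weights
+// are {-0.0625, 0.5625, 0.5625, -0.0625}, which sum to 1.0. This is the case
+// behind the worst-case intermediate 255 * 1.125 = 286.875 discussed in
+// compute_flow_error() below.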
+
+// Compare two regions of width x height pixels, one rooted at position
+// (x, y) in src and the other at (x + u, y + v) in ref.
+// This function computes the per-pixel difference between the two regions
+// and stores it in dt.
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
+                                      int width, int height, int stride, int x,
+                                      int y, double u, double v, int16_t *dt) {
+  // Split offset into integer and fractional parts, and compute cubic
+  // interpolation kernels
+  const int u_int = (int)floor(u);
+  const int v_int = (int)floor(v);
+  const double u_frac = u - floor(u);
+  const double v_frac = v - floor(v);
+
+  int h_kernel[4];
+  int v_kernel[4];
+  get_cubic_kernel_int(u_frac, h_kernel);
+  get_cubic_kernel_int(v_frac, v_kernel);
+
+  int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+
+  // Clamp coordinates so that all pixels we fetch will remain within the
+  // allocated border region, but allow them to go far enough out that
+  // the border pixels' values do not change.
+  // Since we are calculating an 8x8 block, the bottom-right pixel
+  // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+  // interpolation has 4 taps, meaning that the output of pixel
+  // (x_w, y_w) depends on the pixels in the range
+  // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+  //
+  // Thus the most extreme coordinates which will be fetched are
+  // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+  const int x0 = clamp(x + u_int, -9, width);
+  const int y0 = clamp(y + v_int, -9, height);
+
+  // Horizontal convolution.
+  const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1);
+  int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel));
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) {
+    uint8x16_t r = vld1q_u8(ref_start + i * stride);
+    uint16x8_t r0 = vmovl_u8(vget_low_u8(r));
+    uint16x8_t r1 = vmovl_u8(vget_high_u8(r));
+
+    int16x8_t s0 = vreinterpretq_s16_u16(r0);
+    int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1));
+    int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2));
+    int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3));
+
+    int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3);
+
+    int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3);
+
+    // 6 is the maximum allowable number of extra bits which will avoid
+    // the intermediate values overflowing an int16_t. The most extreme
+    // intermediate value occurs when:
+    // * The input pixels are [0, 255, 255, 0]
+    // * u_frac = 0.5
+    // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+    // As an integer with 6 fractional bits, that is 18360, which fits
+    // in an int16_t. But with 7 fractional bits it would be 36720,
+    // which is too large.
+
+    int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6),
+                                 vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6));
+    vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum);
+  }
+
+  // Vertical convolution.
+  int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel));
+  int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE;
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+    int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE);
+    int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE);
+    int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE);
+    int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE);
+
+    int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3);
+
+    int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3);
+
+    uint8x8_t s = vld1_u8(src + (i + y) * stride + x);
+    int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3));
+
+    // This time, we have to round off the 6 extra bits which were kept
+    // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+    // of precision to match the scale of the dx and dy arrays.
+    sum_lo = vrshrq_n_s32(sum_lo,
+                          DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+    sum_hi = vrshrq_n_s32(sum_hi,
+                          DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+    int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16));
+    int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16));
+    vst1q_s16(dt + i * DISFLOW_PATCH_SIZE,
+              vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi)));
+  }
+}
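
As a cross-check of the overflow bound described in the comment above, here is a small standalone C program (not part of the patch; it recomputes the standard Catmull-Rom weights rather than calling the library's kernel helpers) showing that the worst-case intermediate value fits in an int16_t with 6 fractional bits but not with 7.

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Catmull-Rom cubic weights for a fractional offset x in [0, 1). At x = 0.5
// these are {-1/16, 9/16, 9/16, -1/16}.
static void cubic_kernel_dbl(double x, double kernel[4]) {
  const double x2 = x * x;
  const double x3 = x2 * x;
  kernel[0] = -0.5 * x3 + x2 - 0.5 * x;
  kernel[1] = 1.5 * x3 - 2.5 * x2 + 1.0;
  kernel[2] = -1.5 * x3 + 2.0 * x2 + 0.5 * x;
  kernel[3] = 0.5 * x3 - 0.5 * x2;
}

int main(void) {
  double k[4];
  cubic_kernel_dbl(0.5, k);

  // Most extreme intermediate value: input pixels {0, 255, 255, 0}.
  const int px[4] = { 0, 255, 255, 0 };
  double unscaled = 0;
  for (int i = 0; i < 4; ++i) unscaled += k[i] * px[i];
  printf("unscaled worst case = %.3f\n", unscaled);  // 286.875

  // 6 fractional bits fit in an int16_t; 7 do not.
  assert(lround(unscaled * 64) <= INT16_MAX);   // 18360
  assert(lround(unscaled * 128) > INT16_MAX);   // 36720
  return 0;
}
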
+
+static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+                                  int16_t *dst, int dst_stride) {
+  int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+  // Horizontal filter, using kernel {1, 0, -1}.
+  const uint8_t *src_start = src - 1 * src_stride - 1;
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+    uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+    uint8x8_t s0 = vget_low_u8(s);
+    uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+    // Given that the kernel is {1, 0, -1} the convolution is a simple
+    // subtraction.
+    int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2));
+
+    vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff);
+  }
+
+  // Vertical filter, using kernel {1, 2, 1}.
+  // This kernel can be split into two 2-tap kernels of value {1, 1}.
+  // That way we need only 3 add operations to perform the convolution, one of
+  // which can be reused for the next line.
+  int16x8_t s0 = vld1q_s16(tmp);
+  int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE);
+  int16x8_t sum01 = vaddq_s16(s0, s1);
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+    int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE);
+
+    int16x8_t sum12 = vaddq_s16(s1, s2);
+    int16x8_t sum = vaddq_s16(sum01, sum12);
+
+    vst1q_s16(dst + i * dst_stride, sum);
+
+    sum01 = sum12;
+    s1 = s2;
+  }
+}
+
+static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+                                  int16_t *dst, int dst_stride) {
+  int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+  // Horizontal filter, using kernel {1, 2, 1}.
+  // This kernel can be split into two 2-tap kernels of value {1, 1}.
+  // That way we need only 3 add operations to perform the convolution.
+  const uint8_t *src_start = src - 1 * src_stride - 1;
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+    uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+    uint8x8_t s0 = vget_low_u8(s);
+    uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1));
+    uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+    uint16x8_t sum01 = vaddl_u8(s0, s1);
+    uint16x8_t sum12 = vaddl_u8(s1, s2);
+    uint16x8_t sum = vaddq_u16(sum01, sum12);
+
+    vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum));
+  }
+
+  // Vertical filter, using kernel {1, 0, -1}.
+  // Load the whole block at once to avoid redundant loads during convolution.
+  int16x8_t t[10];
+  load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4],
+                &t[5], &t[6], &t[7], &t[8], &t[9]);
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+    // Given that the kernel is {1, 0, -1} the convolution is a simple
+    // subtraction.
+    int16x8_t diff = vsubq_s16(t[i], t[i + 2]);
+
+    vst1q_s16(dst + i * dst_stride, diff);
+  }
+}
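
Both Neon Sobel filters above share the same separable two-pass structure. The following plain-C sketch of the x-gradient variant is illustrative only (the names are not library API); it assumes the caller provides at least one pixel of valid border around the 8x8 patch, as the Neon code does.

#include <stdint.h>

#define PATCH 8  // stands in for DISFLOW_PATCH_SIZE in this sketch

// Horizontal pass with {1, 0, -1}, then vertical pass with {1, 2, 1},
// where {1, 2, 1} is applied as two overlapping {1, 1} additions.
static void sobel_x_scalar(const uint8_t *src, int src_stride, int16_t *dst,
                           int dst_stride) {
  int16_t tmp[PATCH * (PATCH + 2)];

  // Horizontal {1, 0, -1}: one extra row above and below the patch is needed.
  for (int i = -1; i < PATCH + 1; ++i) {
    for (int j = 0; j < PATCH; ++j) {
      tmp[(i + 1) * PATCH + j] =
          (int16_t)(src[i * src_stride + j - 1] - src[i * src_stride + j + 1]);
    }
  }

  // Vertical {1, 2, 1} as (row[i] + row[i+1]) + (row[i+1] + row[i+2]).
  for (int i = 0; i < PATCH; ++i) {
    for (int j = 0; j < PATCH; ++j) {
      const int sum01 = tmp[i * PATCH + j] + tmp[(i + 1) * PATCH + j];
      const int sum12 = tmp[(i + 1) * PATCH + j] + tmp[(i + 2) * PATCH + j];
      dst[i * dst_stride + j] = (int16_t)(sum01 + sum12);
    }
  }
}
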
+
+// Computes the components of the system of equations used to solve for
+// a flow vector.
+//
+// The flow equations are a least-squares system, derived as follows:
+//
+// For each pixel in the patch, we calculate the current error `dt`,
+// and the x and y gradients `dx` and `dy` of the source patch.
+// This means that, to first order, the squared error for this pixel is
+//
+//    (dt + u * dx + v * dy)^2
+//
+// where (u, v) are the incremental changes to the flow vector.
+//
+// We then want to find the values of u and v which minimize the sum
+// of the squared error across all pixels. Conveniently, this fits exactly
+// into the form of a least squares problem, with one equation
+//
+//   u * dx + v * dy = -dt
+//
+// for each pixel.
+//
+// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE,
+// and absorbing the - sign elsewhere, this results in the least squares system
+//
+//   M = |sum(dx * dx)  sum(dx * dy)|
+//       |sum(dx * dy)  sum(dy * dy)|
+//
+//   b = |sum(dx * dt)|
+//       |sum(dy * dt)|
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+                                       const int16_t *dy, int dy_stride,
+                                       double *M_inv) {
+  int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                       vdupq_n_s32(0) };
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+    int16x8_t x = vld1q_s16(dx + i * dx_stride);
+    int16x8_t y = vld1q_s16(dy + i * dy_stride);
+    sum[0] = vmlal_s16(sum[0], vget_low_s16(x), vget_low_s16(x));
+    sum[0] = vmlal_s16(sum[0], vget_high_s16(x), vget_high_s16(x));
+
+    sum[1] = vmlal_s16(sum[1], vget_low_s16(x), vget_low_s16(y));
+    sum[1] = vmlal_s16(sum[1], vget_high_s16(x), vget_high_s16(y));
+
+    sum[3] = vmlal_s16(sum[3], vget_low_s16(y), vget_low_s16(y));
+    sum[3] = vmlal_s16(sum[3], vget_high_s16(y), vget_high_s16(y));
+  }
+  sum[2] = sum[1];
+
+  int32x4_t res = horizontal_add_4d_s32x4(sum);
+
+  // Apply regularization
+  // We follow the standard regularization method of adding `k * I` before
+  // inverting. This ensures that the matrix will be invertible.
+  //
+  // Setting the regularization strength k to 1 seems to work well here, as
+  // typical values coming from the other equations are very large (1e5 to
+  // 1e6, with an upper limit of around 6e7, at the time of writing).
+  // It also preserves the property that all matrix values are whole numbers,
+  // which is convenient for integerized SIMD implementation.
+
+  double M0 = (double)vgetq_lane_s32(res, 0) + 1;
+  double M1 = (double)vgetq_lane_s32(res, 1);
+  double M2 = (double)vgetq_lane_s32(res, 2);
+  double M3 = (double)vgetq_lane_s32(res, 3) + 1;
+
+  // Invert matrix M.
+  double det = (M0 * M3) - (M1 * M2);
+  assert(det >= 1);
+  const double det_inv = 1 / det;
+
+  M_inv[0] = M3 * det_inv;
+  M_inv[1] = -M1 * det_inv;
+  M_inv[2] = -M2 * det_inv;
+  M_inv[3] = M0 * det_inv;
+}
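
For reference, a scalar sketch of the same computation (illustrative names, not library API): accumulate the 2x2 Gram matrix of the gradients, regularize with + I, and invert directly, which is what the C path does via compute_flow_matrix() followed by its separate 2x2 inversion step.

#include <assert.h>
#include <stdint.h>

#define PATCH 8  // stands in for DISFLOW_PATCH_SIZE in this sketch

static void flow_matrix_inv_scalar(const int16_t *dx, int dx_stride,
                                   const int16_t *dy, int dy_stride,
                                   double M_inv[4]) {
  int64_t m0 = 0, m1 = 0, m3 = 0;
  for (int i = 0; i < PATCH; ++i) {
    for (int j = 0; j < PATCH; ++j) {
      const int x = dx[i * dx_stride + j];
      const int y = dy[i * dy_stride + j];
      m0 += x * x;  // sum(dx * dx)
      m1 += x * y;  // sum(dx * dy)
      m3 += y * y;  // sum(dy * dy)
    }
  }

  // Regularize with k = 1: every eigenvalue becomes >= 1, so det >= 1 and
  // the inverse always exists.
  const double M0 = (double)m0 + 1, M1 = (double)m1, M3 = (double)m3 + 1;
  const double det = M0 * M3 - M1 * M1;
  assert(det >= 1);

  M_inv[0] = M3 / det;
  M_inv[1] = -M1 / det;
  M_inv[2] = -M1 / det;
  M_inv[3] = M0 / det;
}
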
+
+static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
+                                       const int16_t *dy, int dy_stride,
+                                       const int16_t *dt, int dt_stride,
+                                       int *b) {
+  int32x4_t b_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+    int16x8_t dx16 = vld1q_s16(dx + i * dx_stride);
+    int16x8_t dy16 = vld1q_s16(dy + i * dy_stride);
+    int16x8_t dt16 = vld1q_s16(dt + i * dt_stride);
+
+    b_s32[0] = vmlal_s16(b_s32[0], vget_low_s16(dx16), vget_low_s16(dt16));
+    b_s32[0] = vmlal_s16(b_s32[0], vget_high_s16(dx16), vget_high_s16(dt16));
+
+    b_s32[1] = vmlal_s16(b_s32[1], vget_low_s16(dy16), vget_low_s16(dt16));
+    b_s32[1] = vmlal_s16(b_s32[1], vget_high_s16(dy16), vget_high_s16(dt16));
+  }
+
+  int32x4_t b_red = horizontal_add_2d_s32(b_s32[0], b_s32[1]);
+  vst1_s32(b, add_pairwise_s32x4(b_red));
+}
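
The reduction above matches the scalar compute_flow_vector() that this patch removes from disflow.c (visible later in this diff); restated minimally:

#include <stdint.h>

#define PATCH 8  // stands in for DISFLOW_PATCH_SIZE in this sketch

// b[0] = sum(dx * dt), b[1] = sum(dy * dt) over the 8x8 patch.
static void flow_vector_scalar(const int16_t *dx, int dx_stride,
                               const int16_t *dy, int dy_stride,
                               const int16_t *dt, int dt_stride, int b[2]) {
  b[0] = 0;
  b[1] = 0;
  for (int i = 0; i < PATCH; ++i) {
    for (int j = 0; j < PATCH; ++j) {
      b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
      b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
    }
  }
}
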
+
+void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref,
+                                    int x, int y, int width, int height,
+                                    int stride, double *u, double *v) {
+  double M_inv[4];
+  int b[2];
+  int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+  // Compute gradients within this patch
+  const uint8_t *src_patch = &src[y * stride + x];
+  sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+  sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+  compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv);
+
+  for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+    compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
+    compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
+                        DISFLOW_PATCH_SIZE, b);
+
+    // Solve flow equations to find a better estimate for the flow vector
+    // at this point
+    const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+    const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+    *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+    *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+    if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+      // Stop iteration when we're close to convergence
+      break;
+    }
+  }
+}
diff --git a/aom_dsp/flow_estimation/corner_detect.c b/aom_dsp/flow_estimation/corner_detect.c
index 7848295..284d1bd 100644
--- a/aom_dsp/flow_estimation/corner_detect.c
+++ b/aom_dsp/flow_estimation/corner_detect.c
@@ -24,10 +24,10 @@
 
 #define FAST_BARRIER 18
 
-size_t av1_get_corner_list_size() { return sizeof(CornerList); }
+size_t av1_get_corner_list_size(void) { return sizeof(CornerList); }
 
-CornerList *av1_alloc_corner_list() {
-  CornerList *corners = (CornerList *)aom_calloc(1, sizeof(CornerList));
+CornerList *av1_alloc_corner_list(void) {
+  CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners));
   if (!corners) {
     return NULL;
   }
@@ -39,7 +39,7 @@
   return corners;
 }
 
-void compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
   const uint8_t *buf = pyr->layers[0].buffer;
   int width = pyr->layers[0].width;
   int height = pyr->layers[0].height;
@@ -49,14 +49,14 @@
   int num_corners;
   xy *const frame_corners_xy = aom_fast9_detect_nonmax(
       buf, width, height, stride, FAST_BARRIER, &scores, &num_corners);
+  if (num_corners < 0) return false;
 
-  if (num_corners <= 0) {
-    // Some error occured, so no corners are available
-    corners->num_corners = 0;
-  } else if (num_corners <= MAX_CORNERS) {
+  if (num_corners <= MAX_CORNERS) {
     // Use all detected corners
-    memcpy(corners->corners, frame_corners_xy,
-           sizeof(*frame_corners_xy) * num_corners);
+    if (num_corners != 0) {
+      memcpy(corners->corners, frame_corners_xy,
+             sizeof(*frame_corners_xy) * num_corners);
+    }
     corners->num_corners = num_corners;
   } else {
     // There are more than MAX_CORNERS corners available, so pick out a subset
@@ -96,9 +96,10 @@
 
   free(scores);
   free(frame_corners_xy);
+  return true;
 }
 
-void av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
   assert(corners);
 
 #if CONFIG_MULTITHREAD
@@ -106,13 +107,14 @@
 #endif  // CONFIG_MULTITHREAD
 
   if (!corners->valid) {
-    compute_corner_list(pyr, corners);
-    corners->valid = true;
+    corners->valid = compute_corner_list(pyr, corners);
   }
+  bool valid = corners->valid;
 
 #if CONFIG_MULTITHREAD
   pthread_mutex_unlock(&corners->mutex);
 #endif  // CONFIG_MULTITHREAD
+  return valid;
 }
 
 #ifndef NDEBUG
diff --git a/aom_dsp/flow_estimation/corner_detect.h b/aom_dsp/flow_estimation/corner_detect.h
index c77813e..d05846c 100644
--- a/aom_dsp/flow_estimation/corner_detect.h
+++ b/aom_dsp/flow_estimation/corner_detect.h
@@ -53,11 +53,11 @@
   int corners[2 * MAX_CORNERS];
 } CornerList;
 
-size_t av1_get_corner_list_size();
+size_t av1_get_corner_list_size(void);
 
-CornerList *av1_alloc_corner_list();
+CornerList *av1_alloc_corner_list(void);
 
-void av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
 
 #ifndef NDEBUG
 // Check if a corner list has already been computed.
diff --git a/aom_dsp/flow_estimation/corner_match.c b/aom_dsp/flow_estimation/corner_match.c
index f34178e..cef719b 100644
--- a/aom_dsp/flow_estimation/corner_match.c
+++ b/aom_dsp/flow_estimation/corner_match.c
@@ -147,13 +147,13 @@
   }
 }
 
-int aom_determine_correspondence(const unsigned char *src,
-                                 const int *src_corners, int num_src_corners,
-                                 const unsigned char *ref,
-                                 const int *ref_corners, int num_ref_corners,
-                                 int width, int height, int src_stride,
-                                 int ref_stride,
-                                 Correspondence *correspondences) {
+static int determine_correspondence(const unsigned char *src,
+                                    const int *src_corners, int num_src_corners,
+                                    const unsigned char *ref,
+                                    const int *ref_corners, int num_ref_corners,
+                                    int width, int height, int src_stride,
+                                    int ref_stride,
+                                    Correspondence *correspondences) {
   // TODO(sarahparker) Improve this to include 2-way match
   int i, j;
   int num_correspondences = 0;
@@ -202,7 +202,8 @@
 
 bool av1_compute_global_motion_feature_match(
     TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
-    int bit_depth, MotionModel *motion_models, int num_motion_models) {
+    int bit_depth, MotionModel *motion_models, int num_motion_models,
+    bool *mem_alloc_failed) {
   int num_correspondences;
   Correspondence *correspondences;
   ImagePyramid *src_pyramid = src->y_pyramid;
@@ -211,10 +212,22 @@
   CornerList *ref_corners = ref->corners;
 
   // Precompute information we will need about each frame
-  aom_compute_pyramid(src, bit_depth, src_pyramid);
-  av1_compute_corner_list(src_pyramid, src_corners);
-  aom_compute_pyramid(ref, bit_depth, ref_pyramid);
-  av1_compute_corner_list(ref_pyramid, ref_corners);
+  if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
 
   const uint8_t *src_buffer = src_pyramid->layers[0].buffer;
   const int src_width = src_pyramid->layers[0].width;
@@ -229,14 +242,17 @@
   // find correspondences between the two images
   correspondences = (Correspondence *)aom_malloc(src_corners->num_corners *
                                                  sizeof(*correspondences));
-  if (!correspondences) return false;
-  num_correspondences = aom_determine_correspondence(
+  if (!correspondences) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  num_correspondences = determine_correspondence(
       src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer,
       ref_corners->corners, ref_corners->num_corners, src_width, src_height,
       src_stride, ref_stride, correspondences);
 
   bool result = ransac(correspondences, num_correspondences, type,
-                       motion_models, num_motion_models);
+                       motion_models, num_motion_models, mem_alloc_failed);
 
   aom_free(correspondences);
   return result;
diff --git a/aom_dsp/flow_estimation/corner_match.h b/aom_dsp/flow_estimation/corner_match.h
index bb69944..4435d2c 100644
--- a/aom_dsp/flow_estimation/corner_match.h
+++ b/aom_dsp/flow_estimation/corner_match.h
@@ -29,17 +29,10 @@
 #define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
 #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
 
-int aom_determine_correspondence(const unsigned char *src,
-                                 const int *src_corners, int num_src_corners,
-                                 const unsigned char *ref,
-                                 const int *ref_corners, int num_ref_corners,
-                                 int width, int height, int src_stride,
-                                 int ref_stride,
-                                 Correspondence *correspondences);
-
 bool av1_compute_global_motion_feature_match(
     TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
-    int bit_depth, MotionModel *motion_models, int num_motion_models);
+    int bit_depth, MotionModel *motion_models, int num_motion_models,
+    bool *mem_alloc_failed);
 
 #ifdef __cplusplus
 }
diff --git a/aom_dsp/flow_estimation/disflow.c b/aom_dsp/flow_estimation/disflow.c
index a8e7b06..a010c81 100644
--- a/aom_dsp/flow_estimation/disflow.c
+++ b/aom_dsp/flow_estimation/disflow.c
@@ -154,9 +154,13 @@
 // (x, y) in src and the other at (x + u, y + v) in ref.
 // This function returns the sum of squared pixel differences between
 // the two regions.
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
-                                      int width, int height, int stride, int x,
-                                      int y, double u, double v, int16_t *dt) {
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+                                       int width, int height, int stride, int x,
+                                       int y, double u, double v,
+                                       const int16_t *dx, const int16_t *dy,
+                                       int *b) {
+  memset(b, 0, 2 * sizeof(*b));
+
   // Split offset into integer and fractional parts, and compute cubic
   // interpolation kernels
   const int u_int = (int)floor(u);
@@ -230,8 +234,9 @@
       const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
       const int warped = ROUND_POWER_OF_TWO(result, round_bits);
       const int src_px = src[(x + j) + (y + i) * stride] << 3;
-      const int err = warped - src_px;
-      dt[i * DISFLOW_PATCH_SIZE + j] = err;
+      const int dt = warped - src_px;
+      b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt;
+      b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt;
     }
   }
 }
@@ -351,20 +356,6 @@
   M[3] = (double)tmp[3];
 }
 
-static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
-                                       const int16_t *dy, int dy_stride,
-                                       const int16_t *dt, int dt_stride,
-                                       int *b) {
-  memset(b, 0, 2 * sizeof(*b));
-
-  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
-    for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
-      b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
-      b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
-    }
-  }
-}
-
 // Try to invert the matrix M
 // Note: Due to the nature of how a least-squares matrix is constructed, all of
 // the eigenvalues will be >= 0, and therefore det M >= 0 as well.
@@ -388,7 +379,6 @@
   double M[4];
   double M_inv[4];
   int b[2];
-  int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
   int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
   int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
 
@@ -401,9 +391,8 @@
   invert_2x2(M, M_inv);
 
   for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
-    compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
-    compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
-                        DISFLOW_PATCH_SIZE, b);
+    compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+                        b);
 
     // Solve flow equations to find a better estimate for the flow vector
     // at this point
@@ -463,8 +452,9 @@
 }
 
 // make sure flow_u and flow_v start at 0
-static void compute_flow_field(const ImagePyramid *src_pyr,
+static bool compute_flow_field(const ImagePyramid *src_pyr,
                                const ImagePyramid *ref_pyr, FlowField *flow) {
+  bool mem_status = true;
   assert(src_pyr->n_levels == ref_pyr->n_levels);
 
   double *flow_u = flow->u;
@@ -473,6 +463,10 @@
   const size_t flow_size = flow->stride * (size_t)flow->height;
   double *u_upscale = aom_malloc(flow_size * sizeof(*u_upscale));
   double *v_upscale = aom_malloc(flow_size * sizeof(*v_upscale));
+  if (!u_upscale || !v_upscale) {
+    mem_status = false;
+    goto free_uvscale;
+  }
 
   // Compute flow field from coarsest to finest level of the pyramid
   for (int level = src_pyr->n_levels - 1; level >= 0; --level) {
@@ -522,12 +516,16 @@
       const int upscale_flow_height = cur_flow_height << 1;
       const int upscale_stride = flow->stride;
 
-      av1_upscale_plane_double_prec(
+      bool upscale_u_plane = av1_upscale_plane_double_prec(
           flow_u, cur_flow_height, cur_flow_width, cur_flow_stride, u_upscale,
           upscale_flow_height, upscale_flow_width, upscale_stride);
-      av1_upscale_plane_double_prec(
+      bool upscale_v_plane = av1_upscale_plane_double_prec(
           flow_v, cur_flow_height, cur_flow_width, cur_flow_stride, v_upscale,
           upscale_flow_height, upscale_flow_width, upscale_stride);
+      if (!upscale_u_plane || !upscale_v_plane) {
+        mem_status = false;
+        goto free_uvscale;
+      }
 
       // Multiply all flow vectors by 2.
       // When we move down a pyramid level, the image resolution doubles.
@@ -569,8 +567,10 @@
       }
     }
   }
+free_uvscale:
   aom_free(u_upscale);
   aom_free(v_upscale);
+  return mem_status;
 }
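
Every failure path above funnels through a single cleanup label, relying on aom_free(NULL) being a no-op. A minimal self-contained sketch of the idiom (hypothetical names, plain malloc/free):

#include <stdbool.h>
#include <stdlib.h>

static bool process_with_scratch(size_t n) {
  bool ok = true;
  double *buf_a = malloc(n * sizeof(*buf_a));
  double *buf_b = malloc(n * sizeof(*buf_b));
  if (!buf_a || !buf_b) {
    ok = false;
    goto cleanup;
  }

  // ... work that may also fail and jump to cleanup ...

cleanup:
  free(buf_a);  // free(NULL) is harmless, so a partial allocation is fine
  free(buf_b);
  return ok;
}
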
 
 static FlowField *alloc_flow_field(int frame_width, int frame_height) {
@@ -612,14 +612,24 @@
                                        YV12_BUFFER_CONFIG *src,
                                        YV12_BUFFER_CONFIG *ref, int bit_depth,
                                        MotionModel *motion_models,
-                                       int num_motion_models) {
+                                       int num_motion_models,
+                                       bool *mem_alloc_failed) {
   // Precompute information we will need about each frame
   ImagePyramid *src_pyramid = src->y_pyramid;
   CornerList *src_corners = src->corners;
   ImagePyramid *ref_pyramid = ref->y_pyramid;
-  aom_compute_pyramid(src, bit_depth, src_pyramid);
-  av1_compute_corner_list(src_pyramid, src_corners);
-  aom_compute_pyramid(ref, bit_depth, ref_pyramid);
+  if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
 
   const int src_width = src_pyramid->layers[0].width;
   const int src_height = src_pyramid->layers[0].height;
@@ -627,14 +637,22 @@
   assert(ref_pyramid->layers[0].height == src_height);
 
   FlowField *flow = alloc_flow_field(src_width, src_height);
-  if (!flow) return false;
+  if (!flow) {
+    *mem_alloc_failed = true;
+    return false;
+  }
 
-  compute_flow_field(src_pyramid, ref_pyramid, flow);
+  if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) {
+    *mem_alloc_failed = true;
+    free_flow_field(flow);
+    return false;
+  }
 
   // find correspondences between the two images using the flow field
   Correspondence *correspondences =
       aom_malloc(src_corners->num_corners * sizeof(*correspondences));
   if (!correspondences) {
+    *mem_alloc_failed = true;
     free_flow_field(flow);
     return false;
   }
@@ -643,7 +661,7 @@
       determine_disflow_correspondence(src_corners, flow, correspondences);
 
   bool result = ransac(correspondences, num_correspondences, type,
-                       motion_models, num_motion_models);
+                       motion_models, num_motion_models, mem_alloc_failed);
 
   aom_free(correspondences);
   free_flow_field(flow);
diff --git a/aom_dsp/flow_estimation/disflow.h b/aom_dsp/flow_estimation/disflow.h
index 2e97ba2..d772c8a 100644
--- a/aom_dsp/flow_estimation/disflow.h
+++ b/aom_dsp/flow_estimation/disflow.h
@@ -93,7 +93,8 @@
                                        YV12_BUFFER_CONFIG *src,
                                        YV12_BUFFER_CONFIG *ref, int bit_depth,
                                        MotionModel *motion_models,
-                                       int num_motion_models);
+                                       int num_motion_models,
+                                       bool *mem_alloc_failed);
 
 #ifdef __cplusplus
 }
diff --git a/aom_dsp/flow_estimation/flow_estimation.c b/aom_dsp/flow_estimation/flow_estimation.c
index a6bf942..0f47f86 100644
--- a/aom_dsp/flow_estimation/flow_estimation.c
+++ b/aom_dsp/flow_estimation/flow_estimation.c
@@ -44,15 +44,17 @@
                                YV12_BUFFER_CONFIG *ref, int bit_depth,
                                GlobalMotionMethod gm_method,
                                MotionModel *motion_models,
-                               int num_motion_models) {
+                               int num_motion_models, bool *mem_alloc_failed) {
   switch (gm_method) {
     case GLOBAL_MOTION_METHOD_FEATURE_MATCH:
       return av1_compute_global_motion_feature_match(
-          type, src, ref, bit_depth, motion_models, num_motion_models);
+          type, src, ref, bit_depth, motion_models, num_motion_models,
+          mem_alloc_failed);
     case GLOBAL_MOTION_METHOD_DISFLOW:
-      return av1_compute_global_motion_disflow(
-          type, src, ref, bit_depth, motion_models, num_motion_models);
+      return av1_compute_global_motion_disflow(type, src, ref, bit_depth,
+                                               motion_models, num_motion_models,
+                                               mem_alloc_failed);
     default: assert(0 && "Unknown global motion estimation type");
   }
-  return 0;
+  return false;
 }
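
A hypothetical caller-side sketch (not part of this patch) of how the new mem_alloc_failed out-parameter is meant to be consumed, assuming the dispatcher in this hunk is aom_compute_global_motion() as declared in flow_estimation.h: allocation failures should be surfaced as errors, while an ordinary failure just means no usable model was found.

#include <stdbool.h>

#include "aom_dsp/flow_estimation/flow_estimation.h"

static int estimate_or_oom(TransformationType type, YV12_BUFFER_CONFIG *src,
                           YV12_BUFFER_CONFIG *ref, int bit_depth,
                           GlobalMotionMethod gm_method,
                           MotionModel *motion_models, int num_models) {
  bool mem_alloc_failed = false;
  if (aom_compute_global_motion(type, src, ref, bit_depth, gm_method,
                                motion_models, num_models,
                                &mem_alloc_failed)) {
    return 0;  // success: motion_models[] holds the fitted models
  }
  return mem_alloc_failed ? -1  // propagate as an allocation error
                          : 1;  // no model found; caller can skip global motion
}
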
diff --git a/aom_dsp/flow_estimation/flow_estimation.h b/aom_dsp/flow_estimation/flow_estimation.h
index 4f2192c..2dfae24 100644
--- a/aom_dsp/flow_estimation/flow_estimation.h
+++ b/aom_dsp/flow_estimation/flow_estimation.h
@@ -86,7 +86,7 @@
                                YV12_BUFFER_CONFIG *ref, int bit_depth,
                                GlobalMotionMethod gm_method,
                                MotionModel *motion_models,
-                               int num_motion_models);
+                               int num_motion_models, bool *mem_alloc_failed);
 
 #ifdef __cplusplus
 }
diff --git a/aom_dsp/flow_estimation/ransac.c b/aom_dsp/flow_estimation/ransac.c
index 81c5f2c..b88a07b 100644
--- a/aom_dsp/flow_estimation/ransac.c
+++ b/aom_dsp/flow_estimation/ransac.c
@@ -246,7 +246,8 @@
 // Returns true on success, false on error
 static bool ransac_internal(const Correspondence *matched_points, int npoints,
                             MotionModel *motion_models, int num_desired_motions,
-                            const RansacModelInfo *model_info) {
+                            const RansacModelInfo *model_info,
+                            bool *mem_alloc_failed) {
   assert(npoints >= 0);
   int i = 0;
   int minpts = model_info->minpts;
@@ -297,6 +298,7 @@
   if (!(points1 && points2 && corners1 && corners2 && projected_corners &&
         motions && inlier_buffer)) {
     ret_val = false;
+    *mem_alloc_failed = true;
     goto finish_ransac;
   }
 
@@ -469,7 +471,7 @@
 // Returns true on success, false on error
 bool ransac(const Correspondence *matched_points, int npoints,
             TransformationType type, MotionModel *motion_models,
-            int num_desired_motions) {
+            int num_desired_motions, bool *mem_alloc_failed) {
 #if ALLOW_TRANSLATION_MODELS
   assert(type > IDENTITY && type < TRANS_TYPES);
 #else
@@ -477,5 +479,6 @@
 #endif  // ALLOW_TRANSLATION_MODELS
 
   return ransac_internal(matched_points, npoints, motion_models,
-                         num_desired_motions, &ransac_model_info[type]);
+                         num_desired_motions, &ransac_model_info[type],
+                         mem_alloc_failed);
 }
diff --git a/aom_dsp/flow_estimation/ransac.h b/aom_dsp/flow_estimation/ransac.h
index 6047580..0529b6e 100644
--- a/aom_dsp/flow_estimation/ransac.h
+++ b/aom_dsp/flow_estimation/ransac.h
@@ -26,7 +26,7 @@
 
 bool ransac(const Correspondence *matched_points, int npoints,
             TransformationType type, MotionModel *motion_models,
-            int num_desired_motions);
+            int num_desired_motions, bool *mem_alloc_failed);
 
 #ifdef __cplusplus
 }
diff --git a/aom_dsp/flow_estimation/x86/disflow_sse4.c b/aom_dsp/flow_estimation/x86/disflow_sse4.c
index a62e9a4..77784ee 100644
--- a/aom_dsp/flow_estimation/x86/disflow_sse4.c
+++ b/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -61,12 +61,23 @@
 //
 // TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation
 // instead of bicubic interpolation
-static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
-                                      int width, int height, int stride, int x,
-                                      int y, double u, double v, int16_t *dt) {
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+                                       int width, int height, int stride, int x,
+                                       int y, double u, double v,
+                                       const int16_t *dx, const int16_t *dy,
+                                       int *b) {
   // This function is written to do 8x8 convolutions only
   assert(DISFLOW_PATCH_SIZE == 8);
 
+  // Accumulate 4 32-bit partial sums for each element of b
+  // These will be flattened at the end.
+  __m128i b0_acc = _mm_setzero_si128();
+  __m128i b1_acc = _mm_setzero_si128();
+#if CHECK_RESULTS
+  // Also keep a running sum using the C algorithm, for cross-checking
+  int c_result[2] = { 0 };
+#endif  // CHECK_RESULTS
+
   // Split offset into integer and fractional parts, and compute cubic
   // interpolation kernels
   const int u_int = (int)floor(u);
@@ -231,10 +242,20 @@
     __m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3);
 
     // Calculate delta from the target patch
-    __m128i err = _mm_sub_epi16(warped, src_pixels);
-    _mm_storeu_si128((__m128i *)&dt[i * DISFLOW_PATCH_SIZE], err);
+    __m128i dt = _mm_sub_epi16(warped, src_pixels);
+
+    // Load 8 elements each of dx and dy, to pair with the 8 elements of dt
+    // that we have just computed. Then compute 8 partial sums of dx * dt
+    // and dy * dt, implicitly sum to give 4 partial sums of each, and
+    // accumulate.
+    __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]);
+    __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]);
+    b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt));
+    b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt));
 
 #if CHECK_RESULTS
+    int16_t dt_arr[8];
+    memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr));
     for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
       int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
       int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
@@ -247,12 +268,28 @@
       // of precision to match the scale of the dx and dy arrays.
       const int c_warped = ROUND_POWER_OF_TWO(result, round_bits);
       const int c_src_px = src[(x + j) + (y + i) * stride] << 3;
-      const int c_err = c_warped - c_src_px;
-      (void)c_err;
-      assert(dt[i * DISFLOW_PATCH_SIZE + j] == c_err);
+      const int c_dt = c_warped - c_src_px;
+
+      assert(dt_arr[j] == c_dt);
+
+      c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+      c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt;
     }
 #endif  // CHECK_RESULTS
   }
+
+  // Flatten the two sets of partial sums to find the final value of b
+  // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
+  // We need to do 6 additions in total; a `hadd` instruction can take care
+  // of four of them, leaving two scalar additions.
+  __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
+  b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
+  b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
+
+#if CHECK_RESULTS
+  assert(b[0] == c_result[0]);
+  assert(b[1] == c_result[1]);
+#endif  // CHECK_RESULTS
 }
 
 static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
@@ -401,50 +438,6 @@
   }
 }
 
-static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
-                                       const int16_t *dy, int dy_stride,
-                                       const int16_t *dt, int dt_stride,
-                                       int *b) {
-  __m128i b0_acc = _mm_setzero_si128();
-  __m128i b1_acc = _mm_setzero_si128();
-
-  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
-    // Need to load 8 values of dx, 8 of dy, 8 of dt, which conveniently
-    // works out to one register each. Then just calculate dx * dt, dy * dt,
-    // and (implicitly) sum horizontally in pairs.
-    // This gives four 32-bit partial sums for each of b[0] and b[1],
-    // which can be accumulated and summed at the end.
-    __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]);
-    __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]);
-    __m128i dt_row = _mm_loadu_si128((__m128i *)&dt[i * dt_stride]);
-
-    b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt_row));
-    b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt_row));
-  }
-
-  // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
-  // We might as well use a `hadd` instruction to do 4 of the additions
-  // needed here. Then that just leaves two more additions, which can be
-  // done in scalar code
-  __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
-  b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
-  b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
-
-#if CHECK_RESULTS
-  int c_result[2] = { 0 };
-
-  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
-    for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
-      c_result[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
-      c_result[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
-    }
-  }
-
-  assert(b[0] == c_result[0]);
-  assert(b[1] == c_result[1]);
-#endif  // CHECK_RESULTS
-}
-
 static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
                                        const int16_t *dy, int dy_stride,
                                        double *M) {
@@ -528,7 +521,6 @@
   double M[4];
   double M_inv[4];
   int b[2];
-  int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
   int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
   int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
 
@@ -541,9 +533,8 @@
   invert_2x2(M, M_inv);
 
   for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
-    compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
-    compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
-                        DISFLOW_PATCH_SIZE, b);
+    compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+                        b);
 
     // Solve flow equations to find a better estimate for the flow vector
     // at this point
diff --git a/aom_dsp/pyramid.c b/aom_dsp/pyramid.c
index a26d302..324a18b 100644
--- a/aom_dsp/pyramid.c
+++ b/aom_dsp/pyramid.c
@@ -112,7 +112,7 @@
     return NULL;
   }
 
-  pyr->layers = aom_calloc(n_levels, sizeof(PyramidLayer));
+  pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers));
   if (!pyr->layers) {
     aom_free(pyr);
     return NULL;
@@ -125,10 +125,10 @@
   // These are gathered up first, so that we can allocate all pyramid levels
   // in a single buffer
   size_t buffer_size = 0;
-  size_t *layer_offsets = aom_calloc(n_levels, sizeof(size_t));
+  size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets));
   if (!layer_offsets) {
-    aom_free(pyr);
     aom_free(pyr->layers);
+    aom_free(pyr);
     return NULL;
   }
 
@@ -195,8 +195,8 @@
   pyr->buffer_alloc =
       aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc));
   if (!pyr->buffer_alloc) {
-    aom_free(pyr);
     aom_free(pyr->layers);
+    aom_free(pyr);
     aom_free(layer_offsets);
     return NULL;
   }
@@ -250,7 +250,7 @@
 
 // Compute coarse to fine pyramids for a frame
 // This must only be called while holding frame_pyr->mutex
-static INLINE void fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
                                 ImagePyramid *frame_pyr) {
   int n_levels = frame_pyr->n_levels;
   const int frame_width = frame->y_crop_width;
@@ -312,11 +312,13 @@
     // 2) Up/downsampling by a factor of 2 can be implemented much more
     //    efficiently than up/downsampling by a generic ratio.
     //    TODO(rachelbarker): Use optimized downsample-by-2 function
-    av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
-                     prev_stride, this_buffer, this_height, this_width,
-                     this_stride);
+    if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
+                          prev_stride, this_buffer, this_height, this_width,
+                          this_stride))
+      return false;
     fill_border(this_buffer, this_width, this_height, this_stride);
   }
+  return true;
 }
 
 // Fill out a downsampling pyramid for a given frame.
@@ -331,7 +333,7 @@
 //
 // However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
 // we will still construct the top level.
-void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
                          ImagePyramid *pyr) {
   assert(pyr);
 
@@ -344,9 +346,9 @@
 #endif  // CONFIG_MULTITHREAD
 
   if (!pyr->valid) {
-    fill_pyramid(frame, bit_depth, pyr);
-    pyr->valid = true;
+    pyr->valid = fill_pyramid(frame, bit_depth, pyr);
   }
+  bool valid = pyr->valid;
 
   // At this point, the pyramid is guaranteed to be valid, and can be safely
   // read from without holding the mutex any more
@@ -354,6 +356,7 @@
 #if CONFIG_MULTITHREAD
   pthread_mutex_unlock(&pyr->mutex);
 #endif  // CONFIG_MULTITHREAD
+  return valid;
 }
 
 #ifndef NDEBUG
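
Both aom_compute_pyramid() and av1_compute_corner_list() now follow the same compute-once-under-a-mutex pattern, returning validity instead of assuming it. A minimal generic sketch (illustrative names, plain pthreads, without the CONFIG_MULTITHREAD guard used in the real code):

#include <pthread.h>
#include <stdbool.h>

typedef struct {
  pthread_mutex_t mutex;
  bool valid;
  int value;
} LazyResult;

// Placeholder for real work that may fail (e.g. on allocation).
static bool compute_expensive(int *out) {
  *out = 42;
  return true;
}

static bool lazy_get(LazyResult *r, int *out) {
  pthread_mutex_lock(&r->mutex);
  if (!r->valid) {
    // A failed attempt leaves valid == false, so a later call can retry
    // instead of handing out garbage.
    r->valid = compute_expensive(&r->value);
  }
  const bool valid = r->valid;
  if (valid) *out = r->value;
  pthread_mutex_unlock(&r->mutex);
  return valid;
}
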
diff --git a/aom_dsp/pyramid.h b/aom_dsp/pyramid.h
index 812aae1..9442a1f 100644
--- a/aom_dsp/pyramid.h
+++ b/aom_dsp/pyramid.h
@@ -100,7 +100,7 @@
 //
 // However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
 // we will still construct the top level.
-void aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
                          ImagePyramid *pyr);
 
 #ifndef NDEBUG
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 8dd5b0b..e5c960b 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -11,6 +11,7 @@
 
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
 
 #if !CONFIG_REALTIME_ONLY
 void aom_quantize_b_adaptive_helper_c(
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 341a5ff..8d69e3b 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -257,32 +257,32 @@
            highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
   }
 
-#define HIGHBD_SAD_MXNX4D(m, n)                                              \
-  void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 ref_array[i], ref_stride);  \
-    }                                                                        \
-  }                                                                          \
-  void aom_highbd_sad_skip_##m##x##n##x4d_c(                                 \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
-      int ref_stride, uint32_t *sad_array) {                                 \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i],       \
-                                    2 * ref_stride, (m), (n / 2));           \
-    }                                                                        \
+#define HIGHBD_SAD_MXNX4D(m, n)                                                \
+  void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,              \
+                                                 ref_array[i], ref_stride);    \
+    }                                                                          \
+  }                                                                            \
+  void aom_highbd_sad_skip_##m##x##n##x4d_c(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],   \
+      int ref_stride, uint32_t sad_array[4]) {                                 \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i],         \
+                                    2 * ref_stride, (m), (n / 2));             \
+    }                                                                          \
   }
 // Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
-#define HIGHBD_SAD_MXNX3D(m, n)                                              \
-  void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride,     \
-                                 sad_array);                                 \
+#define HIGHBD_SAD_MXNX3D(m, n)                                                \
+  void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride,       \
+                                 sad_array);                                   \
   }
 
 // 128x128
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
deleted file mode 100644
index 6488de7..0000000
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ /dev/null
@@ -1,977 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-
-typedef int64x2_t v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) {
-  return v64_low_u32(vget_low_s64(a));
-}
-
-SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
-
-SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
-
-SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
-
-SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64(vcreate_s64(b), vcreate_s64(a));
-}
-
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
-}
-
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
-  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
-  return v128_load_aligned(p);
-}
-
-SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
-  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
-  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others wont.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-  return c ? vreinterpretq_s64_s8(
-                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
-           : b;
-#else
-  return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
-                               v64_align(v128_high_v64(b), v128_low_v64(b), c))
-               : v128_from_v64(
-                     v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
-                     v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
-#endif
-}
-
-SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
-
-SIMD_INLINE v128 v128_ones(void) {
-  return vreinterpretq_s64_u8(vdupq_n_u8(-1));
-}
-
-SIMD_INLINE v128 v128_dup_8(uint8_t x) {
-  return vreinterpretq_s64_u8(vdupq_n_u8(x));
-}
-
-SIMD_INLINE v128 v128_dup_16(uint16_t x) {
-  return vreinterpretq_s64_u16(vdupq_n_u16(x));
-}
-
-SIMD_INLINE v128 v128_dup_32(uint32_t x) {
-  return vreinterpretq_s64_u32(vdupq_n_u32(x));
-}
-
-SIMD_INLINE v128 v128_dup_64(uint64_t x) {
-  return vreinterpretq_s64_u64(vdupq_n_u64(x));
-}
-
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
-  int16x8_t t1 = vmulq_s16(
-      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
-  int16x8_t t2 = vmulq_s16(
-      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
-#if AOM_ARCH_AARCH64
-  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
-#else
-  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
-  return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
-  return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
-         v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
-}
-
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
-  int64x2_t t = vpaddlq_s32(
-      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-  return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0);
-}
-
-SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
-#if AOM_ARCH_AARCH64
-  return vaddlvq_u8(vreinterpretq_u8_s64(x));
-#else
-  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
-  return vget_lane_s32(
-      vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v128 v128_padd_s16(v128 a) {
-  return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_padd_u8(v128 a) {
-  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
-}
-
-typedef struct {
-  sad64_internal hi, lo;
-} sad128_internal;
-
-SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
-  sad128_internal s;
-  s.hi = s.lo = vdupq_n_u16(0);
-  return s;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
-   v128_sad_u8_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
-  sad128_internal r;
-  r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
-  r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
-  return r;
-}
-
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-#if AOM_ARCH_AARCH64
-  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
-#else
-  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
-  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
-                                 0);
-#endif
-}
-
-typedef struct {
-  ssd64_internal hi, lo;
-} ssd128_internal;
-
-SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
-  ssd128_internal s;
-  s.hi = s.lo = v64_ssd_u8_init();
-  return s;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  ssd128_internal r;
-  r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
-  r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
-  return r;
-}
-
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
-}
-
-SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
-
-SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
-
-SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
-
-SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
-
-SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
-  return vreinterpretq_s64_u64(
-      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
-
-SIMD_INLINE v128 v128_abs_s16(v128 x) {
-  return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
-}
-
-SIMD_INLINE v128 v128_abs_s8(v128 x) {
-  return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
-}
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
-  return vreinterpretq_s64_s32(
-      vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
-  return vreinterpretq_s64_s16(
-      vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_s16(vuzp2q_s16(
-      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
-                                      vreinterpret_s16_s64(vget_low_s64(b)))),
-      vreinterpretq_s16_s32(
-          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
-#else
-  return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
-                       v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
-  return vreinterpretq_s64_s32(
-      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-}
-
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
-#if AOM_ARCH_AARCH64
-  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
-                           vreinterpret_s16_s64(vget_low_s64(b)));
-  int32x4_t t2 =
-      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
-  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
-#else
-  return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
-                       v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if AOM_ARCH_AARCH64
-  int16x8_t t1 = vmulq_s16(
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
-      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
-  int16x8_t t2 = vmulq_s16(
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
-      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
-  return vreinterpretq_s64_s16(
-      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
-#else
-  return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
-                       v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
-  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
-#if AOM_ARCH_AARCH64
-  uint8x16_t m =
-      vandq_u8(vreinterpretq_u8_s64(a),
-               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
-  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
-#else
-  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
-      vandq_u8(vreinterpretq_u8_s64(a),
-               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
-  int64x2_t s = vreinterpretq_s64_u64(m);
-  return v64_low_u32(v64_ziplo_8(vget_high_s64(s), vget_low_s64(s)));
-#endif
-}
-
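The constant in v128_movemask_8 packs per-byte weights 1, 2, 4, ..., 128 into each 64-bit half, so summing the weights of the negative (all-ones) comparison bytes produces one mask byte per half. A scalar sketch of that reduction, for reference only:

/* Reference-only scalar model: byte j of each 64-bit half carries weight
 * (0x8040201008040201 >> (8 * j)) & 0xff == 1 << j. */
static inline uint32_t movemask_16x8_scalar(const uint8_t bytes[16]) {
  uint32_t lo = 0, hi = 0;
  for (int j = 0; j < 8; j++) {
    if (bytes[j] & 0x80) lo |= 1u << j;
    if (bytes[j + 8] & 0x80) hi |= 1u << j;
  }
  return lo | (hi << 8);
}
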
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
-  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
-  return v128_or(v128_and(b, c), v128_andn(a, c));
-}
-
-SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u8(
-      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u8(
-      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u16(
-      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
-  return vreinterpretq_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u16(
-      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
-  return vreinterpretq_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
-  uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u32(
-      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
-  return vreinterpretq_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u32(
-      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
-  return vreinterpretq_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
-  uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
-  return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return v128_from_v64(vget_low_s64(a), vget_low_s64(b));
-}
-
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return v128_from_v64(vget_high_s64(a), vget_high_s64(b));
-}
-
-SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u8(
-      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u8(
-      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u16(
-      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  uint16x8x2_t r =
-      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
-  return vreinterpretq_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u16(
-      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  uint16x8x2_t r =
-      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
-  return vreinterpretq_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u32(
-      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  uint32x4x2_t r =
-      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
-  return vreinterpretq_s64_u32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u32(
-      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  uint32x4x2_t r =
-      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
-  return vreinterpretq_s64_u32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
-      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
-      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
-      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
-      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
-  return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
-  return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
-  return vreinterpretq_s64_u32(
-      vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
-  return vreinterpretq_s64_s32(
-      vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
-  return vreinterpretq_s64_u32(
-      vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
-  return vreinterpretq_s64_s32(
-      vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if AOM_ARCH_AARCH64
-  return vreinterpretq_s64_u8(
-      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
-#else
-  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
-                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
-  uint8x8_t shuffle_hi =
-      vtbl2_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern)));
-  uint8x8_t shuffle_lo =
-      vtbl2_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern)));
-  return v128_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle_hi), 0),
-                      vget_lane_u64(vreinterpret_u64_u8(shuffle_lo), 0));
-#endif
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero()
-                 : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a),
-                                                 vdupq_n_s8((int8_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero()
-                 : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a),
-                                                 vdupq_n_s8(-(int8_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_ones()
-                 : vreinterpretq_s64_s8(vshlq_s8(vreinterpretq_s8_s64(a),
-                                                 vdupq_n_s8(-(int8_t)c)));
-}
-
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_zero()
-                  : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a),
-                                                    vdupq_n_s16((int16_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_zero()
-                  : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a),
-                                                    vdupq_n_s16(-(int16_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_ones()
-                  : vreinterpretq_s64_s16(vshlq_s16(vreinterpretq_s16_s64(a),
-                                                    vdupq_n_s16(-(int16_t)c)));
-}
-
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_zero()
-                  : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a),
-                                                    vdupq_n_s32((int32_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_zero()
-                  : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a),
-                                                    vdupq_n_s32(-(int32_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_ones()
-                  : vreinterpretq_s64_s32(vshlq_s32(vreinterpretq_s32_s64(a),
-                                                    vdupq_n_s32(-(int32_t)c)));
-}
-
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_zero()
-                  : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a),
-                                                    vdupq_n_s64((int64_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_zero()
-                  : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a),
-                                                    vdupq_n_s64(-(int64_t)c)));
-}
-
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-(int64_t)c));
-}
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
-  return n < 8
-             ? v128_from_64(
-                   (uint64_t)vorr_u64(
-                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                  n * 8),
-                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                  (8 - n) * 8)),
-                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                        n * 8))
-             : (n == 8 ? v128_from_64(
-                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
-                       : v128_from_64((uint64_t)vshl_n_u64(
-                                          vreinterpret_u64_s64(vget_low_s64(a)),
-                                          (n - 8) * 8),
-                                      0));
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  return n == 0
-             ? a
-             : (n < 8
-                    ? v128_from_64(
-                          (uint64_t)vshr_n_u64(
-                              vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
-                          (uint64_t)vorr_u64(
-                              vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                         n * 8),
-                              vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                         (8 - n) * 8)))
-                    : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
-                                                    vget_high_s64(a)))
-                              : v128_from_64(0, (uint64_t)vshr_n_u64(
-                                                    vreinterpret_u64_s64(
-                                                        vget_high_s64(a)),
-                                                    (n - 8) * 8))));
-}
-
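The nested conditionals above split a 128-bit byte shift across the two 64-bit halves: for n < 8 the bits leaving one half are OR-ed into the other, and for n >= 8 the surviving half is shifted by the leftover amount. A scalar sketch, assuming 0 <= n < 16:

/* Reference-only scalar model of v128_shl_n_byte for 0 <= n < 16. */
static inline void shl_n_byte_scalar(uint64_t *hi, uint64_t *lo, unsigned n) {
  if (n == 0) return;
  if (n < 8) {
    *hi = (*hi << (n * 8)) | (*lo >> ((8 - n) * 8));
    *lo <<= n * 8;
  } else { /* the low half moves entirely into the high half */
    *hi = *lo << ((n - 8) * 8);
    *lo = 0;
  }
}
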
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
-           : a;
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return c ? vshrq_n_s64(a, c) : a;
-}
-
-#else
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
-  if (n < 8)
-    return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
-                                v64_shr_n_byte(v128_low_v64(a), 8 - n)),
-                         v64_shl_n_byte(v128_low_v64(a), n));
-  else
-    return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  if (n < 8)
-    return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
-                         v64_or(v64_shr_n_byte(v128_low_v64(a), n),
-                                v64_shl_n_byte(v128_high_v64(a), 8 - n)));
-  else
-    return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
-}
-
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return v128_shl_8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return v128_shr_u8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return v128_shr_s8(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return v128_shl_16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return v128_shr_u16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return v128_shr_s16(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return v128_shl_32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return v128_shr_u32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return v128_shr_s32(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return v128_shl_64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return v128_shr_u64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return v128_shr_s64(a, c);
-}
-
-#endif
-
-typedef uint32x4_t sad128_internal_u16;
-
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
-  return vdupq_n_u32(0);
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
-                                             v128 b) {
-  return vaddq_u32(
-      s, vpaddlq_u16(vsubq_u16(
-             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
-             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
-}
-
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
-  uint64x2_t t = vpaddlq_u32(s);
-  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)),
-                                 0);
-}
-
-typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
-                                             v128 b) {
-  v128 d = v128_sub_16(a, b);
-  d = v128_madd_s16(d, d);
-  return v128_add_64(
-      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
-}
-
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
-  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
-}
-
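As the comments above note, the intermediate accumulator values are implementation dependent; only the init / accumulate / sum sequence is meaningful. A hypothetical usage sketch for the u16 SAD accumulator (the block-walking helper and its arguments are illustrative, not part of this header):

/* Hypothetical usage; the row/stride handling is illustrative only. */
static uint32_t block_sad_u16(const uint16_t *a, const uint16_t *b, int rows,
                              int stride) {
  sad128_internal_u16 s = v128_sad_u16_init();
  for (int r = 0; r < rows; r++) {
    s = v128_sad_u16(s, v128_load_unaligned(a + r * stride),
                     v128_load_unaligned(b + r * stride));
  }
  return v128_sad_u16_sum(s); /* only the finalised sum is well defined */
}
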
-#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/aom_dsp/simd/v256_intrinsics_arm.h b/aom_dsp/simd/v256_intrinsics_arm.h
deleted file mode 100644
index bd86ea1..0000000
--- a/aom_dsp/simd/v256_intrinsics_arm.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-
-#include "aom_dsp/simd/v256_intrinsics_v128.h"
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
deleted file mode 100644
index 8d07c34..0000000
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-#include "aom_ports/arm.h"
-
-#ifdef AOM_INCOMPATIBLE_GCC
-#error Incompatible gcc
-#endif
-
-typedef int64x1_t v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) {
-  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
-}
-
-SIMD_INLINE uint32_t v64_high_u32(v64 a) {
-  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
-}
-
-SIMD_INLINE int32_t v64_low_s32(v64 a) {
-  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
-}
-
-SIMD_INLINE int32_t v64_high_s32(v64 a) {
-  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
-}
-
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
-  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
-                     d);
-}
-
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
-  return vcreate_s64((uint64_t)x << 32 | y);
-}
-
-SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-
-SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)vget_lane_s64(x, 0); }
-
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
-  return *((uint32_t *)p);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
-  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
-}
-
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
-  *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if defined(__clang__)
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
-                0);
-#elif defined(__CC_ARM)
-  *((__packed uint32_t *)p) = a;
-#elif defined(__GNUC__)
-  struct Unaligned32Struct {
-    uint32_t value;
-    uint8_t dummy;  // To make the size non-power-of-two.
-  } __attribute__((__packed__));
-  ((struct Unaligned32Struct *)p)->value = a;
-#else
-  memcpy(p, &a, 4);
-#endif
-}
-
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
-  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
-  return v64_load_aligned(p);
-}
-
-SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
-  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
-  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-// The following function requires an immediate.
-// Some compilers will check this when optimising, others won't.
-SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-  return c ? vreinterpret_s64_s8(
-                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
-           : b;
-#else
-  return c ? v64_from_64(((uint64_t)vget_lane_s64(b, 0) >> c * 8) |
-                         ((uint64_t)vget_lane_s64(a, 0) << (8 - c) * 8))
-           : b;
-#endif
-}
-
-SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); }
-
-SIMD_INLINE v64 v64_dup_8(uint8_t x) {
-  return vreinterpret_s64_u8(vdup_n_u8(x));
-}
-
-SIMD_INLINE v64 v64_dup_16(uint16_t x) {
-  return vreinterpret_s64_u16(vdup_n_u16(x));
-}
-
-SIMD_INLINE v64 v64_dup_32(uint32_t x) {
-  return vreinterpret_s64_u32(vdup_n_u32(x));
-}
-
-SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int16x8_t t =
-      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
-#if AOM_ARCH_AARCH64
-  return vaddlvq_s16(t);
-#else
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
-  return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
-#endif
-}
-
-SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vaddlvq_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-#else
-  int64x2_t r =
-      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0);
-#endif
-}
-
-SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-#if AOM_ARCH_AARCH64
-  return vaddlv_u8(vreinterpret_u8_s64(x));
-#else
-  return vget_lane_u64(
-      vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))), 0);
-#endif
-}
-
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return vget_lane_s64(vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))), 0);
-}
-
-typedef uint16x8_t sad64_internal;
-
-SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_sad_u8_sum().
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
-  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-}
-
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
-#if AOM_ARCH_AARCH64
-  return vaddlvq_u16(s);
-#else
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(r), vget_low_u64(r)),
-                                 0);
-#endif
-}
-
-typedef uint32x4_t ssd64_internal;
-
-SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_ssd_u8_sum().
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
-  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
-}
-
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-#if AOM_ARCH_AARCH64
-  return vaddvq_u32(s);
-#else
-  uint64x2_t t = vpaddlq_u32(s);
-  return vget_lane_u32(
-      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
-
-SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
-
-SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
-
-SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
-
-SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
-  return vreinterpret_s64_u32(
-      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
-  return vreinterpret_s64_s32(
-      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_abs_s16(v64 x) {
-  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
-}
-
-SIMD_INLINE v64 v64_abs_s8(v64 x) {
-  return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
-}
-
-SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  int16x8_t t = vreinterpretq_s16_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
-#else
-  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
-#endif
-}
-
-SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
-  return vreinterpret_s64_s32(
-      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
-  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
-  return vreinterpret_s64_s32(
-      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
-                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
-}
-
-SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  int16x8_t t =
-      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
-                vmovl_s8(vreinterpret_s8_s64(y)));
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
-}
-
-SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u8(
-      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u8(
-      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u16(
-      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
-  return vreinterpret_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u16(
-      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
-  return vreinterpret_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u32(
-      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
-  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
-  return vreinterpret_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u32(
-      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
-  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
-  return vreinterpret_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
-  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
-  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
-  return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
-  return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(vqmovun_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u8(
-      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u8(
-      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u16(
-      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpret_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
-#if AOM_ARCH_AARCH64
-  return vreinterpret_s64_u16(
-      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpret_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
-  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
-  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
-  return vreinterpret_s64_s32(
-      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
-  return vreinterpret_s64_u32(
-      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
-  return vreinterpret_s64_u8(
-      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(
-      vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8((int8_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(
-      vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-(int8_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  return vreinterpret_s64_s8(
-      vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-(int8_t)c)));
-}
-
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(
-      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16((int16_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(
-      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int16_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
-  return vreinterpret_s64_s16(
-      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int16_t)c)));
-}
-
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(
-      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32((int32_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(
-      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int32_t)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
-  return vreinterpret_s64_s32(
-      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int32_t)c)));
-}
-
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others won't.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
-  return vshl_n_s64(a, c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
-  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a;
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return c ? vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a;
-}
-
-#else
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
-  return v64_from_64(v64_u64(a) << c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
-  return v64_from_64(v64_u64(a) >> c * 8);
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return v64_shr_u16(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return v64_shr_s16(a, c);
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return v64_shr_u32(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return v64_shr_s32(a, c);
-}
-
-#endif
-
-#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
diff --git a/aom_dsp/sse.c b/aom_dsp/sse.c
index 16f6b58..bfe76ed 100644
--- a/aom_dsp/sse.c
+++ b/aom_dsp/sse.c
@@ -9,7 +9,12 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-/* Sum the difference between every corresponding element of the buffers. */
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 63c1e5f..f02c307 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -1058,7 +1058,7 @@
 }
 
 #define HIGHBD_OBMC_VAR(W, H)                                              \
-  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
+  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
       const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
       const int32_t *mask, unsigned int *sse) {                            \
     int sum;                                                               \
@@ -1087,7 +1087,7 @@
   }
 
 #define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
-  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(                \
+  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(              \
       const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
       const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
     uint16_t fdata3[(H + 1) * W];                                              \
@@ -1098,8 +1098,8 @@
     aom_highbd_var_filter_block2d_bil_second_pass(                             \
         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                                \
-    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                                 wsrc, mask, sse);             \
+    return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
+                                                   W, wsrc, mask, sse);        \
   }                                                                            \
                                                                                \
   unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 5823059..245fda1 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -23,32 +23,6 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
 
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3,
-                                                              3, 4, 2, 3, 3, 4,
-                                                              4, 5, 5, 6 };
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7,
-                                                              7, 8, 6, 7, 7, 8,
-                                                              8, 9, 9, 10 };
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4,
-                                      4, 5, 5, 6, 6, 7, 7, 8 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6,
-                                      6, 7, 7, 8, 8, 9, 9, 10 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt3_global[16]) = { 4, 5, 5, 6,  6,  7,  7,  8,
-                                      8, 9, 9, 10, 10, 11, 11, 12 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt4_global[16]) = { 6,  7,  7,  8,  8,  9,  9,  10,
-                                      10, 11, 11, 12, 12, 13, 13, 14 };
-
 DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
   0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
   2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
@@ -64,11 +38,6 @@
   2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
 };
 
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
 static void aom_filter_block1d4_h4_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -196,74 +165,6 @@
   }
 }
 
-void aom_filter_block1d4_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
-  // the same data in both lanes of the 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter into the first lane
-  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bits in the filter into the first lane
-  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the second 16 bits in the filter into the second lane
-  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
-  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the fourth 16 bits in the filter into the second lane
-  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
-  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-  // loading the local filters
-  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr += src_pixels_per_line;
-
-    // save only 4 bytes
-    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
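For context on what the deleted *_h8 intrinsics computed: each output pixel is an 8-tap filter sum with a rounding bias of 64, an arithmetic shift by 7, and saturation to 8 bits. A scalar sketch of one output pixel, ignoring the intermediate 16-bit saturating adds (reference only; the helper name is illustrative):

/* Reference-only scalar model of one output pixel of the deleted 8-tap
 * horizontal filter; src points at the output position. */
static inline uint8_t filter8_pixel_ref(const uint8_t *src,
                                        const int16_t *filter) {
  int sum = 64; /* rounding bias, the addFilterReg64 constant */
  for (int k = 0; k < 8; k++) sum += src[k - 3] * filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
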
 static void aom_filter_block1d8_h4_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -403,168 +304,6 @@
   }
 }
 
-void aom_filter_block1d8_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
-  // the same data in both lanes of the 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pixels_per_line;
-
-    // save only 8 bytes
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
-  // the same data in both lanes of the 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
-    // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += out_pitch;
-  }
-}
-
 static void aom_filter_block1d16_h4_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index ca2752e..9ab9143 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -25,6 +25,11 @@
   *out_hi = _mm_unpackhi_epi16(in, sign_bits);
 }
 
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(a, sign);
+}
+
 void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -583,21 +588,14 @@
 int aom_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
   __m128i accum = zero;
 
-  for (i = 0; i < length; i += 16) {
-    const __m128i src_line0 = load_tran_low(coeff);
-    const __m128i src_line1 = load_tran_low(coeff + 8);
-    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
-    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
-    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
-    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
-    const __m128i sum0 = _mm_madd_epi16(abs0, one);
-    const __m128i sum1 = _mm_madd_epi16(abs1, one);
-    accum = _mm_add_epi32(accum, sum0);
-    accum = _mm_add_epi32(accum, sum1);
-    coeff += 16;
+  for (i = 0; i < length; i += 4) {
+    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
+    const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
+    accum = _mm_add_epi32(accum, abs_coeff);
+    coeff += 4;
   }
 
   {  // cascading summation of accum
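The rewritten aom_satd_sse2() loop accumulates 32-bit absolute values directly
instead of the old 16-bit max/madd sequence. A minimal scalar sketch of the
sign-flip identity behind invert_sign_32_sse2(), assuming two's-complement
arithmetic as the SIMD code does:

    #include <stdint.h>

    // sign is 0 for a >= 0 and -1 (all bits set) for a < 0, so
    // (a ^ sign) - sign equals a for non-negative a and -a otherwise.
    static int32_t abs_via_sign_flip(int32_t a) {
      const int32_t sign = a >> 31;  // arithmetic shift: 0 or -1
      return (a ^ sign) - sign;
    }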
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index b4ff697..4ca214f 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -14,6 +14,7 @@
 #include <assert.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
diff --git a/aom_dsp/x86/fft_sse2.c b/aom_dsp/x86/fft_sse2.c
index c6023af..bdd235b 100644
--- a/aom_dsp/x86/fft_sse2.c
+++ b/aom_dsp/x86/fft_sse2.c
@@ -28,6 +28,9 @@
   _mm_store_ps(&B[3 * ldb], row4);
 }
 
+// Referenced by fft_avx2.c.
+void aom_transpose_float_sse2(const float *A, float *B, int n);
+
 void aom_transpose_float_sse2(const float *A, float *B, int n) {
   for (int y = 0; y < n; y += 4) {
     for (int x = 0; x < n; x += 4) {
@@ -36,6 +39,9 @@
   }
 }
 
+// Referenced by fft_avx2.c.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);
+
 void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
   const int n2 = n / 2;
   output[0] = packed[0];
diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c
index 8361e2f..11e4577 100644
--- a/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/aom_dsp/x86/highbd_convolve_avx2.c
@@ -11,7 +11,7 @@
 #include <immintrin.h>
 #include <string.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c
index 21389db..31c3c31 100644
--- a/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -12,7 +12,7 @@
 #include <tmmintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index a5c450a..3b0c42c 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -14,6 +14,7 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                 const int16_t *zbin_ptr,
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index e11754e..6c78eee 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -604,7 +604,7 @@
 
 static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
     int M, int N, int D, const uint8_t *src, int src_stride,
-    const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
+    const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
   __m256i sad_vec[4];
   const uint16_t *refp[4];
   const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
@@ -639,26 +639,26 @@
 
 #define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                          \
   void aom_highbd_sad##m##x##n##x4d_avx2(                                     \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],   \
-      int ref_stride, uint32_t *sad_array) {                                  \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
+      int ref_stride, uint32_t sad_array[4]) {                                \
     aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
                              sad_array);                                      \
   }
-#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                   \
-  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                             \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
-      int ref_stride, uint32_t *sad_array) {                                \
-    aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
-                             2 * ref_stride, sad_array);                    \
-    sad_array[0] <<= 1;                                                     \
-    sad_array[1] <<= 1;                                                     \
-    sad_array[2] <<= 1;                                                     \
-    sad_array[3] <<= 1;                                                     \
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                    \
+  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                              \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array,  \
+                             2 * ref_stride, sad_array);                     \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
   }
 #define HIGHBD_SAD_MXNX3D_AVX2(m, n)                                          \
   void aom_highbd_sad##m##x##n##x3d_avx2(                                     \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],   \
-      int ref_stride, uint32_t *sad_array) {                                  \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
+      int ref_stride, uint32_t sad_array[4]) {                                \
     aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
                              sad_array);                                      \
   }
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 36e6473..b4ff91d 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -729,14 +729,16 @@
 VAR_FN(16, 32, 16, 9)
 VAR_FN(16, 16, 16, 8)
 VAR_FN(16, 8, 8, 7)
-VAR_FN(16, 4, 16, 6)
-VAR_FN(8, 32, 8, 8)
-VAR_FN(32, 8, 8, 8)
-VAR_FN(16, 64, 16, 10)
-VAR_FN(64, 16, 16, 10)
 VAR_FN(8, 16, 8, 7)
 VAR_FN(8, 8, 8, 6)
 
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(16, 64, 16, 10)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 32, 8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+
 #undef VAR_FN
 
 #define SSE2_HEIGHT(H)                                                 \
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 621ef7a..242a548 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -11,7 +11,7 @@
 
 #include <immintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 #include "aom_dsp/x86/intrapred_x86.h"
 #include "aom_dsp/x86/intrapred_utils.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c
index fb30420..9de8bf3 100644
--- a/aom_dsp/x86/intrapred_sse4.c
+++ b/aom_dsp/x86/intrapred_sse4.c
@@ -12,7 +12,7 @@
 #include <emmintrin.h>  // SSE2
 #include <smmintrin.h>  /* SSE4.1 */
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 #include "aom_dsp/x86/intrapred_x86.h"
 #include "aom_dsp/x86/intrapred_utils.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c
index 2aa2a05..9d1b7d4 100644
--- a/aom_dsp/x86/obmc_sad_avx2.c
+++ b/aom_dsp/x86/obmc_sad_avx2.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c
index 0338a8c..542572c 100644
--- a/aom_dsp/x86/obmc_sad_sse4.c
+++ b/aom_dsp/x86/obmc_sad_sse4.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_variance_avx2.c b/aom_dsp/x86/obmc_variance_avx2.c
index b2df8a9..c23d8c4 100644
--- a/aom_dsp/x86/obmc_variance_avx2.c
+++ b/aom_dsp/x86/obmc_variance_avx2.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c
index aa73c39..89b050e 100644
--- a/aom_dsp/x86/obmc_variance_sse4.c
+++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
@@ -258,10 +259,10 @@
   *sse += xx_hsum_epi32_si64(v_sse_d);
 }
 
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, int w, int h,
-                                        unsigned int *sse, int *sum) {
+static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask, int w, int h,
+                                          unsigned int *sse, int *sum) {
   int64_t sum64 = 0;
   uint64_t sse64 = 0;
   if (w == 4) {
@@ -328,11 +329,11 @@
 }
 
 #define HBD_OBMCVARWXH(W, H)                                               \
-  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
+  unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1(               \
       const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
       const int32_t *mask, unsigned int *sse) {                            \
     int sum;                                                               \
-    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
+    highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
   }                                                                        \
                                                                            \
diff --git a/aom_dsp/x86/sse_avx2.c b/aom_dsp/x86/sse_avx2.c
index e6ee2fc..c5a5f5c 100644
--- a/aom_dsp/x86/sse_avx2.c
+++ b/aom_dsp/x86/sse_avx2.c
@@ -8,9 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+
 #include <smmintrin.h>
 #include <immintrin.h>
 
+#include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
@@ -85,6 +87,7 @@
   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 }
+
 static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, __m256i *sum) {
   const __m128i v_a0 = xx_loadl_64(a);
@@ -96,6 +99,7 @@
   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 }
+
 int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int width, int height) {
   int32_t y = 0;
@@ -249,6 +253,7 @@
   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
 }
+
 int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
                             int b_stride, int width, int height) {
   int32_t y = 0;
diff --git a/aom_dsp/x86/sse_sse4.c b/aom_dsp/x86/sse_sse4.c
index 5f95eb9..7e74554 100644
--- a/aom_dsp/x86/sse_sse4.c
+++ b/aom_dsp/x86/sse_sse4.c
@@ -13,6 +13,7 @@
 #include <smmintrin.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
@@ -62,6 +63,7 @@
   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
 }
+
 static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
                                __m128i *sum) {
   const __m128i v_a0 = xx_loadl_64(a);
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
index 4d6ee6a..b729e5f 100644
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -62,8 +62,8 @@
 }
 
 static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
-  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
-  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   return yy_set_m128i(mhi, mlo);
 }
 
diff --git a/aom_ports/aarch32_cpudetect.c b/aom_ports/aarch32_cpudetect.c
new file mode 100644
index 0000000..753f957
--- /dev/null
+++ b/aom_ports/aarch32_cpudetect.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+// Feature detection code for Armv7-A / AArch32.
+
+#include "arm_cpudetect.h"
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(_MSC_VER)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  // MSVC has no inline __asm support for Arm, but it does let you __emit
+  // instructions via their assembled hex code.
+  // All of these instructions should be essentially nops.
+  __try {
+    // VORR q0,q0,q0
+    __emit(0xF2200150);
+    flags |= HAS_NEON;
+  } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+    // Ignore exception.
+  }
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  uint64_t features = android_getCpuFeatures();
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH32_HWCAP_NEON (1 << 12)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+  unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON
+  if (hwcap & AOM_AARCH32_HWCAP_NEON) flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+#else   // end __linux__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+int aom_arm_cpu_caps(void) {
+  int flags = 0;
+  if (arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  return arm_get_cpu_caps() & arm_cpu_env_mask();
+}
diff --git a/aom_ports/aarch64_cpudetect.c b/aom_ports/aarch64_cpudetect.c
new file mode 100644
index 0000000..43d5a14
--- /dev/null
+++ b/aom_ports/aarch64_cpudetect.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "arm_cpudetect.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+// sysctlbyname() parameter documentation for instruction set characteristics:
+// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+static INLINE bool have_feature(const char *feature) {
+  int64_t feature_present = 0;
+  size_t size = sizeof(feature_present);
+  if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+    return false;
+  }
+  return feature_present;
+}
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if (have_feature("hw.optional.armv8_crc32")) flags |= HAS_ARM_CRC32;
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if (have_feature("hw.optional.arm.FEAT_DotProd")) flags |= HAS_NEON_DOTPROD;
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (have_feature("hw.optional.arm.FEAT_I8MM")) flags |= HAS_NEON_I8MM;
+#endif  // HAVE_NEON_I8MM
+  return flags;
+}
+
+#elif defined(_WIN32)  // end __APPLE__
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+// IsProcessorFeaturePresent() parameter documentation:
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_ARM_CRC32;
+  }
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 20348, supported by Windows 11 and Windows Server 2022.
+#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_NEON_DOTPROD
+  // No I8MM or SVE feature detection available on Windows at time of writing.
+  return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH64_HWCAP_CRC32 (1 << 7)
+#define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define AOM_AARCH64_HWCAP_SVE (1 << 22)
+#define AOM_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if (hwcap & AOM_AARCH64_HWCAP_CRC32) flags |= HAS_ARM_CRC32;
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if (hwcap & AOM_AARCH64_HWCAP_ASIMDDP) flags |= HAS_NEON_DOTPROD;
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (hwcap2 & AOM_AARCH64_HWCAP2_I8MM) flags |= HAS_NEON_I8MM;
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE;
+#endif  // HAVE_SVE
+  return flags;
+}
+
+#elif defined(__Fuchsia__)  // end __linux__
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM
+#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
+#endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  uint32_t features;
+  zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+  if (status != ZX_OK) return flags;
+#if HAVE_ARM_CRC32
+  if (features & ZX_ARM64_FEATURE_ISA_CRC32) flags |= HAS_ARM_CRC32;
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if (features & ZX_ARM64_FEATURE_ISA_DP) flags |= HAS_NEON_DOTPROD;
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (features & ZX_ARM64_FEATURE_ISA_I8MM) flags |= HAS_NEON_I8MM;
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (features & ZX_ARM64_FEATURE_ISA_SVE) flags |= HAS_SVE;
+#endif  // HAVE_SVE
+  return flags;
+}
+
+#else  // end __Fuchsia__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+int aom_arm_cpu_caps(void) {
+  int flags = 0;
+  if (!arm_cpu_env_flags(&flags)) {
+    flags = arm_get_cpu_caps() & arm_cpu_env_mask();
+  }
+
+  // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
+  if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_NEON_I8MM;
+
+  // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available.
+  if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE;
+  if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE;
+
+  return flags;
+}
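For the Linux path above, a worked example of how the hwcap bits map onto the
returned flags, using a hypothetical core with CRC32 and DotProd but neither
I8MM nor SVE:

    // getauxval(AT_HWCAP)  -> bit 7 (CRC32) and bit 20 (ASIMDDP) set
    // getauxval(AT_HWCAP2) -> bit 13 (I8MM) clear
    // arm_get_cpu_caps()   -> HAS_NEON | HAS_ARM_CRC32 | HAS_NEON_DOTPROD
    // The dependency pruning in aom_arm_cpu_caps() leaves this set unchanged,
    // since nothing that requires DotProd or I8MM was reported.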
diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake
index e3b67e4..8fd2ffd 100644
--- a/aom_ports/aom_ports.cmake
+++ b/aom_ports/aom_ports.cmake
@@ -24,8 +24,10 @@
 
 list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
 
-list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
-            "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_AARCH32
+            "${AOM_ROOT}/aom_ports/aarch32_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_AARCH64
+            "${AOM_ROOT}/aom_ports/aarch64_cpudetect.c")
 
 if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK)
   include_directories(${ANDROID_NDK}/sources/android/cpufeatures)
@@ -57,8 +59,11 @@
   elseif(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
     add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
     set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" STREQUAL "arm64")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH64})
+    set(aom_ports_has_symbols 1)
   elseif("${AOM_TARGET_CPU}" MATCHES "arm")
-    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH32})
     set(aom_ports_has_symbols 1)
   elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
diff --git a/aom_ports/arm.h b/aom_ports/arm.h
index cb1fb9b..853741d 100644
--- a/aom_ports/arm.h
+++ b/aom_ports/arm.h
@@ -19,12 +19,16 @@
 extern "C" {
 #endif
 
-/*ARMv5TE "Enhanced DSP" instructions.*/
-#define HAS_EDSP 0x01
-/*ARMv6 "Parallel" or "Media" instructions.*/
-#define HAS_MEDIA 0x02
-/*ARMv7 optional NEON instructions.*/
-#define HAS_NEON 0x04
+// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
+#define HAS_NEON (1 << 0)
+// Armv8.0-A optional CRC32 instructions, mandatory from Armv8.1-A.
+#define HAS_ARM_CRC32 (1 << 1)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 2)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 3)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 4)
 
 int aom_arm_cpu_caps(void);
 
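A short usage sketch for the expanded capability bits; the caller here is
illustrative and not part of the patch:

    const int caps = aom_arm_cpu_caps();
    if (caps & HAS_SVE) {
      // SVE kernels (flag pruning guarantees DotProd and I8MM are also set)
    } else if (caps & HAS_NEON_DOTPROD) {
      // Neon dot-product kernels
    } else if (caps & HAS_NEON) {
      // baseline Neon kernels
    }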
diff --git a/aom_ports/arm_cpudetect.c b/aom_ports/arm_cpudetect.c
deleted file mode 100644
index 276ef61..0000000
--- a/aom_ports/arm_cpudetect.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include "aom_ports/arm.h"
-#include "config/aom_config.h"
-
-#ifdef WINAPI_FAMILY
-#include <winapifamily.h>
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define getenv(x) NULL
-#endif
-#endif
-
-static int arm_cpu_env_flags(int *flags) {
-  char *env;
-  env = getenv("AOM_SIMD_CAPS");
-  if (env && *env) {
-    *flags = (int)strtol(env, NULL, 0);
-    return 0;
-  }
-  *flags = 0;
-  return -1;
-}
-
-static int arm_cpu_env_mask(void) {
-  char *env;
-  env = getenv("AOM_SIMD_CAPS_MASK");
-  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
-}
-
-#if !CONFIG_RUNTIME_CPU_DETECT || defined(__APPLE__)
-
-int aom_arm_cpu_caps(void) {
-  /* This function should actually be a no-op. There is no way to adjust any of
-   * these because the RTCD tables do not exist: the functions are called
-   * statically */
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-#if HAVE_NEON
-  flags |= HAS_NEON;
-#endif /* HAVE_NEON */
-  return flags & mask;
-}
-
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT || __APPLE__ */
-#if HAVE_NEON && !AOM_ARCH_AARCH64
-/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
-#undef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#undef WIN32_EXTRA_LEAN
-#define WIN32_EXTRA_LEAN
-#include <windows.h>
-#endif  // HAVE_NEON && !AOM_ARCH_AARCH64
-
-int aom_arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-#if AOM_ARCH_AARCH64
-  return HAS_NEON & mask;
-#else
-/* MSVC has no inline __asm support for ARM, but it does let you __emit
- *  instructions via their assembled hex code.
- * All of these instructions should be essentially nops.
- */
-#if HAVE_NEON
-  if (mask & HAS_NEON) {
-    __try {
-      /*VORR q0,q0,q0*/
-      __emit(0xF2200150);
-      flags |= HAS_NEON;
-    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
-      /*Ignore exception.*/
-    }
-  }
-#endif  /* HAVE_NEON */
-  return flags & mask;
-#endif  // AOM_ARCH_AARCH64
-}
-
-#elif defined(__ANDROID__) /* end _MSC_VER */
-#include <cpu-features.h>
-
-int aom_arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  uint64_t features;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  features = android_getCpuFeatures();
-
-#if HAVE_NEON
-  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
-#endif /* HAVE_NEON */
-  return flags & mask;
-}
-
-#elif defined(__linux__) /* end __ANDROID__ */
-
-#include <stdio.h>
-
-int aom_arm_cpu_caps(void) {
-  FILE *fin;
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
-   *  on Android.
-   * This also means that detection will fail in Scratchbox.
-   */
-  fin = fopen("/proc/cpuinfo", "r");
-  if (fin != NULL) {
-    /* 512 should be enough for anybody (it's even enough for all the flags
-     * that x86 has accumulated... so far).
-     */
-    char buf[512];
-    while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_NEON
-      if (memcmp(buf, "Features", 8) == 0) {
-        char *p;
-        p = strstr(buf, " neon");
-        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= HAS_NEON;
-        }
-      }
-#endif /* HAVE_NEON */
-    }
-    fclose(fin);
-  }
-  return flags & mask;
-}
-#else  /* end __linux__ */
-#error \
-    "Runtime CPU detection selected, but no CPU detection method " \
-"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
-#endif
diff --git a/aom_ports/arm_cpudetect.h b/aom_ports/arm_cpudetect.h
new file mode 100644
index 0000000..33c2d1b
--- /dev/null
+++ b/aom_ports/arm_cpudetect.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define ANDROID_USE_CPU_FEATURES_LIB 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18.
+// getauxval() is supported since Android API level 18 (Android 4.3).
+// First Android version with 64-bit support was Android 5.x (API level 21).
+#include <cpu-features.h>
+#endif
+
+static bool arm_cpu_env_flags(int *flags) {
+  const char *env = getenv("AOM_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return true;
+  }
+  return false;
+}
+
+static int arm_cpu_env_mask(void) {
+  const char *env = getenv("AOM_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
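The two environment hooks above can pin or restrict the reported capabilities;
strtol() with base 0 accepts decimal, octal and 0x-prefixed hex. For example,
with the bit values from the new HAS_* defines in aom_ports/arm.h:

    // AOM_SIMD_CAPS=0        -> report no SIMD caps at all (C paths only)
    // AOM_SIMD_CAPS_MASK=0x1 -> cap detection at plain Neon
    // AOM_SIMD_CAPS_MASK=0x7 -> allow Neon, CRC32 and DotProd; mask I8MM/SVE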
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 3c5b992..7f4c165 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -13,7 +13,6 @@
 #define AOM_AOM_PORTS_BITOPS_H_
 
 #include <assert.h>
-#include <stdint.h>
 
 #include "aom_ports/msvc.h"
 #include "config/aom_config.h"
@@ -34,12 +33,8 @@
 // These versions of get_msb() are only valid when n != 0 because all
 // of the optimized versions are undefined when n == 0:
 
-// get_byteswap64:
-// Returns the number (uint64_t) with byte-positions reversed
-// e.g. input 0x123456789ABCDEF0 returns 0xF0DEBC9A78563412
-
 // GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
-// MSVC: https://learn.microsoft.com/en-us/cpp/c-runtime-library/
+// MSVC: https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
@@ -48,10 +43,6 @@
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
-  return __builtin_bswap64(num);
-}
 #elif defined(USE_MSC_INTRINSICS)
 #pragma intrinsic(_BitScanReverse)
 
@@ -61,10 +52,6 @@
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
-  return _byteswap_uint64(num);
-}
 #undef USE_MSC_INTRINSICS
 #else
 static INLINE int get_msb(unsigned int n) {
@@ -82,26 +69,6 @@
   }
   return log;
 }
-
-static INLINE uint64_t get_byteswap64(uint64_t num) {
-  uint64_t out = 0x00;
-  uint64_t mask = 0xFF00000000000000;
-  int bit_shift = 56;  // 7 bytes
-  // 4 ms bytes
-  do {
-    out |= (num & mask) >> bit_shift;
-    mask >>= 8;
-    bit_shift -= 16;
-  } while (bit_shift >= 0);
-  // 4 ls bytes
-  bit_shift = 8;  // 1 byte
-  do {
-    out |= (num & mask) << bit_shift;
-    mask >>= 8;
-    bit_shift += 16;
-  } while (bit_shift <= 56);
-  return out;
-}
 #endif
 
 #ifdef __cplusplus
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index e396842..a70ce82 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -24,16 +24,6 @@
 #define DECLARE_ALIGNED(n, typ, val) typ val
 #endif
 
-/* Indicates that the usage of the specified variable has been audited to assure
- * that it's safe to use uninitialized. Silences 'may be used uninitialized'
- * warnings on gcc.
- */
-#if defined(__GNUC__) && __GNUC__
-#define UNINITIALIZED_IS_SAFE(x) x = x
-#else
-#define UNINITIALIZED_IS_SAFE(x) x
-#endif
-
 #if HAVE_NEON && defined(_MSC_VER)
 #define __builtin_prefetch(x)
 #endif
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index 82376f4..94b400b 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -193,7 +193,9 @@
     if (num_pyramid_levels > 0) {
       ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels,
                                          use_highbitdepth);
+      if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR;
       ybf->corners = av1_alloc_corner_list();
+      if (!ybf->corners) return AOM_CODEC_MEM_ERROR;
     }
 #endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 
diff --git a/aom_util/aom_thread.c b/aom_util/aom_thread.c
index 2c62b24..fa3b0a2 100644
--- a/aom_util/aom_thread.c
+++ b/aom_util/aom_thread.c
@@ -24,6 +24,7 @@
 #include <string.h>  // for memset()
 
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/sanitizer.h"
 #include "aom_util/aom_thread.h"
 
 #if CONFIG_MULTITHREAD
@@ -144,11 +145,30 @@
       pthread_mutex_destroy(&worker->impl_->mutex_);
       goto Error;
     }
+    pthread_attr_t attr;
+    if (pthread_attr_init(&attr)) goto Error2;
+      // Debug ASan builds require at least ~1MiB of stack; prevents
+      // failures on macOS arm64 where the default is 512KiB.
+      // See: https://crbug.com/aomedia/3379
+#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
+    !defined(NDEBUG)
+    size_t stacksize;
+    if (!pthread_attr_getstacksize(&attr, &stacksize)) {
+      const size_t kMinStackSize = 1 << 20;  // 1 MiB
+      if (stacksize < kMinStackSize &&
+          pthread_attr_setstacksize(&attr, kMinStackSize)) {
+        pthread_attr_destroy(&attr);
+        goto Error2;
+      }
+    }
+#endif
     pthread_mutex_lock(&worker->impl_->mutex_);
-    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+    ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
     if (ok) worker->status_ = OK;
     pthread_mutex_unlock(&worker->impl_->mutex_);
+    pthread_attr_destroy(&attr);
     if (!ok) {
+    Error2:
       pthread_mutex_destroy(&worker->impl_->mutex_);
       pthread_cond_destroy(&worker->impl_->condition_);
     Error:
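The stack-size bump follows the standard POSIX attribute pattern; a standalone
sketch with the same 1 MiB minimum as kMinStackSize above:

    pthread_attr_t attr;
    size_t stacksize;
    if (pthread_attr_init(&attr) == 0 &&
        pthread_attr_getstacksize(&attr, &stacksize) == 0 &&
        stacksize < ((size_t)1 << 20)) {
      pthread_attr_setstacksize(&attr, (size_t)1 << 20);  // request >= 1 MiB
    }
    // ... pass &attr to pthread_create(), then pthread_attr_destroy(&attr).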
diff --git a/aom_util/aom_thread.h b/aom_util/aom_thread.h
index 2df190f..ec2ea43 100644
--- a/aom_util/aom_thread.h
+++ b/aom_util/aom_thread.h
@@ -37,6 +37,7 @@
 #include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT
 typedef HANDLE pthread_t;
+typedef int pthread_attr_t;
 typedef CRITICAL_SECTION pthread_mutex_t;
 
 #if _WIN32_WINNT < 0x0600
@@ -60,7 +61,18 @@
 #define THREADFN unsigned int __stdcall
 #define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
 
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+static INLINE int pthread_attr_init(pthread_attr_t *attr) {
+  (void)attr;
+  return 0;
+}
+
+static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
+  (void)attr;
+  return 0;
+}
+
+static INLINE int pthread_create(pthread_t *const thread,
+                                 const pthread_attr_t *attr,
                                  unsigned int(__stdcall *start)(void *),
                                  void *arg) {
   (void)attr;
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 09306f2..c3f5c33 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -74,7 +74,10 @@
 
     if (detail) fprintf(stderr, "    %s\n", detail);
 
-    if (fatal) exit(EXIT_FAILURE);
+    if (fatal) {
+      aom_codec_destroy(ctx);
+      exit(EXIT_FAILURE);
+    }
   }
 }
 
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 35a2ab4..0575654 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -303,7 +303,7 @@
       ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"),
 #if CONFIG_AV1_ENCODER
   .cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1,
-                          "Speed setting (0..6 in good mode, 5..10 in realtime "
+                          "Speed setting (0..6 in good mode, 5..11 in realtime "
                           "mode, 0..9 in all intra mode)"),
   .rowmtarg =
       ARG_DEF(NULL, "row-mt", 1,
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index b9d0cfe..73c78ca 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -21,7 +21,6 @@
 #include "common/webmenc.h"
 #endif
 #include "aom/aomcx.h"
-#include "aom_dsp/flow_estimation/flow_estimation.h"
 
 enum TestDecodeFatality {
   TEST_DECODE_OFF,
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 43b7665..1bb0539 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -267,7 +267,6 @@
             "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
@@ -319,7 +318,8 @@
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
-list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c"
+            "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3
             "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c")
@@ -347,26 +347,30 @@
             "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
+            "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
-            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/picksrt_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c")
+            "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
+            "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
-            "${AOM_ROOT}/av1/encoder/arm/crc32/hash_crc32.c")
+            "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_NEON
             "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
@@ -376,10 +380,10 @@
             "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/cdef_block_neon.c"
             "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+            "${AOM_ROOT}/av1/common/arm/compound_convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
             "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
-            "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
             "${AOM_ROOT}/av1/common/arm/resize_neon.c"
@@ -387,6 +391,18 @@
             "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
             "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c")
 
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD
+            "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c"
+            "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM
+            "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c"
+            "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c"
+            "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SVE
+            "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
             "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
 
@@ -446,7 +462,14 @@
               "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c")
 
   list(APPEND AOM_AV1_COMMON_INTRIN_NEON
-              "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c")
+              "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_convolve_horiz_rs_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c"
+              "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c")
 
   list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
               "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
@@ -459,6 +482,11 @@
               "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
               "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
               "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
+              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
+              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
 endif()
 
 if(CONFIG_ACCOUNTING)
@@ -623,31 +651,45 @@
   endif()
 
   if(HAVE_NEON)
-    if(AOM_AV1_COMMON_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                    "aom_av1_common"
-                                    "AOM_AV1_COMMON_INTRIN_NEON")
-    endif()
-
+    add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                  "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON")
     if(CONFIG_AV1_ENCODER)
-      if(AOM_AV1_ENCODER_INTRIN_NEON)
-        add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                      "aom_av1_encoder"
-                                      "AOM_AV1_ENCODER_INTRIN_NEON")
-      endif()
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_NEON")
     endif()
+  endif()
 
-    if(HAVE_ARM_CRC32)
-      if(CONFIG_AV1_ENCODER)
-        if(AOM_AV1_ENCODER_INTRIN_ARM_CRC32)
-          add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "crc32"
-                                        "aom_av1_encoder"
-                                        "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
-        endif()
-      endif()
+  if(HAVE_ARM_CRC32)
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "arm_crc32"
+                                    "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
     endif()
   endif()
 
+  if(HAVE_NEON_DOTPROD)
+    add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                  "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_NEON_DOTPROD")
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+                                    "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD")
+    endif()
+  endif()
+
+  if(HAVE_NEON_I8MM)
+    add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+                                  "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_NEON_I8MM")
+  endif()
+
+  if(HAVE_SVE)
+    add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SVE")
+  endif()
+
   if(HAVE_VSX)
     if(AOM_AV1_COMMON_INTRIN_VSX)
       add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 182a098..3c8a39b 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -8,6 +8,7 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -543,6 +544,7 @@
   // Number of stats buffers required for look ahead
   int num_lap_buffers;
   STATS_BUFFER_CTX stats_buf_context;
+  bool monochrome_on_init;
 };
 
 static INLINE int gcd(int64_t a, int b) {
@@ -644,6 +646,18 @@
   if (cfg->g_forced_max_frame_height) {
     RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height);
   }
+  // To avoid integer overflows when multiplying width by height (or values
+  // derived from width and height) using the int type, impose a maximum frame
+  // area (width * height) of 2^30.
+  const unsigned int max_frame_width =
+      cfg->g_forced_max_frame_width ? cfg->g_forced_max_frame_width : cfg->g_w;
+  const unsigned int max_frame_height = cfg->g_forced_max_frame_height
+                                            ? cfg->g_forced_max_frame_height
+                                            : cfg->g_h;
+  const int64_t max_frame_area = (int64_t)max_frame_width * max_frame_height;
+  if (max_frame_area > (1 << 30)) {
+    ERROR("max_frame_area out of range [..2^30]");
+  }
   RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
   RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
   RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
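For scale, the new area guard works out as follows (illustrative values, not
from the patch):

    //  8192 x  8192 == 1 << 26 -> accepted
    // 32768 x 32768 == 1 << 30 -> accepted (exactly at the limit)
    // 65536 x 32768 == 1 << 31 -> rejected ("max_frame_area out of range")
    // The product is formed in int64_t, so the check itself cannot overflow.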
@@ -668,11 +682,7 @@
   RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
   RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
   RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS);
-  if (cfg->g_pass == AOM_RC_ONE_PASS) {
-    RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS);
-  } else {
-    RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
-  }
+  RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   if (cfg->g_usage == AOM_USAGE_ALL_INTRA) {
     RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
     RANGE_CHECK_HI(cfg, kf_max_dist, 0);
@@ -1505,6 +1515,12 @@
       force_key = 1;
   }
 
+  if (ctx->monochrome_on_init && cfg->monochrome == 0) {
+    // TODO(aomedia:3465): Allow this case to work without requiring re-init
+    // of encoder.
+    ERROR("Cannot change to monochrome = 0 after init with monochrome");
+  }
+
   // Prevent increasing lag_in_frames. This check is stricter than it needs
   // to be -- the limit is not increasing past the first lag_in_frames
   // value, but we don't track the initial config, only the last successful
@@ -1585,11 +1601,26 @@
     bool is_sb_size_changed = false;
     av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
     for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
-      av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf,
-                        is_sb_size_changed);
+      AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
+      struct aom_internal_error_info *const error = cpi->common.error;
+      if (setjmp(error->jmp)) {
+        error->setjmp = 0;
+        return error->error_code;
+      }
+      error->setjmp = 1;
+      av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
+      error->setjmp = 0;
     }
     if (ctx->ppi->cpi_lap != NULL) {
-      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed);
+      AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
+      struct aom_internal_error_info *const error = cpi_lap->common.error;
+      if (setjmp(error->jmp)) {
+        error->setjmp = 0;
+        return error->error_code;
+      }
+      error->setjmp = 1;
+      av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
+      error->setjmp = 0;
     }
   }
   return res;
@@ -2569,6 +2600,17 @@
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  AV1_PRIMARY *const ppi = ctx->ppi;
+  AV1_COMP *const cpi = ppi->cpi;
+  const int max_consec_drop = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, args);
+  if (max_consec_drop < 0) return AOM_CODEC_INVALID_PARAM;
+  cpi->rc.max_consec_drop = max_consec_drop;
+  cpi->rc.drop_count_consec = 0;
+  return AOM_CODEC_OK;
+}
+
 #if !CONFIG_REALTIME_ONLY
 aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
                                         STATS_BUFFER_CTX *stats_buf_context,
@@ -2722,6 +2764,8 @@
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
 
+      priv->monochrome_on_init = priv->cfg.monochrome;
+
       priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head,
                                                 *num_lap_buffers, &priv->oxcf);
       if (!priv->ppi) return AOM_CODEC_MEM_ERROR;
@@ -3144,7 +3188,8 @@
       const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data);
       if (status != -1) {
         if (status != AOM_CODEC_OK) {
-          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+          aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s",
+                             cpi->common.error->detail);
         }
       }
       av1_post_encode_updates(cpi_lap, &cpi_lap_data);
@@ -3157,12 +3202,6 @@
       ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
     }
 
-    // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH for
-    // real time encoding.
-    if (is_one_pass_rt_params(cpi) &&
-        cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
-      cpi->gf_frame_index = 0;
-
     // Get the next visible frame. Invisible frames get packed with the next
     // visible frame.
     while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
@@ -3200,7 +3239,8 @@
       }
       if (status == -1) break;
       if (status != AOM_CODEC_OK) {
-        aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+        aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s",
+                           cpi->common.error->detail);
       }
       if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) {
         av1_init_sc_decisions(ppi);
@@ -3557,7 +3597,12 @@
         lc->min_q = params->min_quantizers[layer];
         lc->scaling_factor_num = params->scaling_factor_num[sl];
         lc->scaling_factor_den = params->scaling_factor_den[sl];
-        lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
+        const int layer_target_bitrate = params->layer_target_bitrate[layer];
+        if (layer_target_bitrate > INT_MAX / 1000) {
+          lc->layer_target_bitrate = INT_MAX;
+        } else {
+          lc->layer_target_bitrate = 1000 * layer_target_bitrate;
+        }
         lc->framerate_factor = params->framerate_factor[tl];
         if (tl == ppi->number_temporal_layers - 1)
           target_bandwidth += lc->layer_target_bitrate;
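Note: the clamp above guards the kbps-to-bps conversion against signed overflow. A one-line sketch of the same saturating conversion (helper name illustrative), assuming <limits.h>:

    static int kbps_to_bps_saturating(int kbps) {
      // 1000 * kbps overflows int once kbps exceeds INT_MAX / 1000, so saturate.
      return (kbps > INT_MAX / 1000) ? INT_MAX : 1000 * kbps;
    }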
@@ -4344,6 +4389,7 @@
   { AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc },
   { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
   { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr },
+  { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr },
 
   // Getters
   { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
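Note: with the table entry above, the new AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR control becomes reachable through aom_codec_control(). A hedged usage sketch, assuming an already-initialized encoder context named codec (error handling elided):

    aom_codec_ctx_t codec;
    // ... aom_codec_enc_init(&codec, aom_codec_av1_cx(), &cfg, 0) ...
    // Allow at most 8 consecutive dropped frames in CBR mode; a negative value
    // is rejected with AOM_CODEC_INVALID_PARAM by the handler added above.
    aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, 8);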
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index a1e7558..29c63e2 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -144,6 +144,7 @@
 
   aom_free(ctx->frame_worker);
   aom_free(ctx->buffer_pool);
+  assert(!ctx->img.self_allocd);
   aom_img_free(&ctx->img);
   aom_free(ctx);
   return AOM_CODEC_OK;
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 6e95f70..5e6ffc9 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -288,11 +288,9 @@
                       cdef_info->allocated_mi_rows);
 }
 
-// Assumes cm->rst_info[p].restoration_unit_size is already initialized
+// Allocate buffers which are independent of restoration_unit_size
 void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
   const int num_planes = av1_num_planes(cm);
-  for (int p = 0; p < num_planes; ++p)
-    av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
 
   if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
     CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
@@ -303,21 +301,13 @@
     CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
   }
 
-  // For striped loop restoration, we divide each row of tiles into "stripes",
+  // For striped loop restoration, we divide each plane into "stripes",
   // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
   // luma pixels to match the output from CDEF. We will need to store 2 *
-  // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
-  // able to quickly answer the question "Where is the <n>'th stripe for tile
-  // row <m>?" To make that efficient, we generate the rst_last_stripe array.
-  int num_stripes = 0;
-  for (int i = 0; i < cm->tiles.rows; ++i) {
-    TileInfo tile_info;
-    av1_tile_set_row(&tile_info, cm, i);
-    const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
-    const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
-    const int tile_stripes = (ext_h + 63) / 64;
-    num_stripes += tile_stripes;
-  }
+  // RESTORATION_CTX_VERT lines of data for each stripe.
+  int mi_h = cm->mi_params.mi_rows;
+  const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+  const int num_stripes = (ext_h + 63) / 64;
 
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
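Note: the stripe count is now derived from the frame height in mi units rather than summed over tile rows. A sketch of the same computation as a standalone helper (name illustrative):

    static int count_restoration_stripes(int mi_rows) {
      // Extended height in luma pixels, then 64-pixel stripes rounded up.
      const int ext_h = RESTORATION_UNIT_OFFSET + (mi_rows << MI_SIZE_LOG2);
      return (ext_h + 63) / 64;  // ceil(ext_h / 64)
    }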
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index 8afcd1f..09e5166 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -3606,7 +3606,7 @@
     identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
                              -shift[0]);
     for (int j = 0; j < buf_size_w_div8; ++j) {
-      transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+      transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
     }
     temp_b += 8;
   }
@@ -3665,14 +3665,14 @@
     if (lr_flip == 1) {
       for (int j = 0; j < buf_size_w_div8; ++j) {
         flip_buf_ud_neon(&cur_a[j * 8], 8);
-        transpose_s16_8x8q(
+        transpose_arrays_s16_8x8(
             &cur_a[j * 8],
             &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
       }
       temp_b += 8;
     } else {
       for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
       }
       temp_b += 8;
     }
@@ -3730,7 +3730,7 @@
     identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
                              -shift[0]);
     for (int j = 0; j < buf_size_w_div8; ++j) {
-      transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+      transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
     }
     temp_b += 8;
   }
@@ -3768,7 +3768,7 @@
   int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
   const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
-  int r, bd = 8;
+  int r;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
   const transform_1d_neon col_txfm =
@@ -3795,20 +3795,20 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
+    clamp_buf(temp_in, txfm_size_row, 16);
     col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+            clip_pixel(output[r * stride + c] + temp_out[r]);
       }
     } else {
       // flip upside down
       for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+        output[r * stride + c] = clip_pixel(output[r * stride + c] +
+                                            temp_out[txfm_size_row - r - 1]);
       }
     }
   }
@@ -3832,7 +3832,7 @@
   int32_t *buf_ptr = buf;
   const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                    16, 16, 16, 16 };
-  int r, bd = 8;
+  int r;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
   const transform_1d_neon col_txfm =
@@ -3860,20 +3860,20 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
+    clamp_buf(temp_in, txfm_size_row, 16);
     col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+            clip_pixel(output[r * stride + c] + temp_out[r]);
       }
     } else {
       // flip upside down
       for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+        output[r * stride + c] = clip_pixel(output[r * stride + c] +
+                                            temp_out[txfm_size_row - r - 1]);
       }
     }
   }
@@ -3897,7 +3897,7 @@
   int32_t *buf_ptr = buf;
   const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                    16, 16, 16, 16 };
-  int r, bd = 8;
+  int r;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
   const transform_1d_neon col_txfm =
@@ -3925,20 +3925,20 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
+    clamp_buf(temp_in, txfm_size_row, 16);
     col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+            clip_pixel(output[r * stride + c] + temp_out[r]);
       }
     } else {
       // flip upside down
       for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+        output[r * stride + c] = clip_pixel(output[r * stride + c] +
+                                            temp_out[txfm_size_row - r - 1]);
       }
     }
   }
@@ -3962,7 +3962,7 @@
   int32_t *buf_ptr = buf;
   const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                    16, 16, 16, 16, 16 };
-  int r, bd = 8;
+  int r;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
   const transform_1d_neon col_txfm =
@@ -3989,20 +3989,20 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
+    clamp_buf(temp_in, txfm_size_row, 16);
     col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+            clip_pixel(output[r * stride + c] + temp_out[r]);
       }
     } else {
       // flip upside down
       for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+        output[r * stride + c] = clip_pixel(output[r * stride + c] +
+                                            temp_out[txfm_size_row - r - 1]);
       }
     }
   }
@@ -4026,7 +4026,7 @@
   int32_t *buf_ptr = buf;
   const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                    16, 16, 16, 16, 16 };
-  int r, bd = 8;
+  int r;
   const transform_1d_neon row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
   const transform_1d_neon col_txfm =
@@ -4053,20 +4053,20 @@
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
+    clamp_buf(temp_in, txfm_size_row, 16);
     col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+            clip_pixel(output[r * stride + c] + temp_out[r]);
       }
     } else {
       // flip upside down
       for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+        output[r * stride + c] = clip_pixel(output[r * stride + c] +
+                                            temp_out[txfm_size_row - r - 1]);
       }
     }
   }
@@ -4116,14 +4116,14 @@
     if (lr_flip == 1) {
       for (int j = 0; j < buf_size_w_div8; ++j) {
         flip_buf_ud_neon(&cur_a[j * 8], 8);
-        transpose_s16_8x8q(
+        transpose_arrays_s16_8x8(
             &cur_a[j * 8],
             &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
       }
       temp_b += 8;
     } else {
       for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_s16_8x8q(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
       }
       temp_b += 8;
     }
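Note: the substitutions in this file rely on these identity-transform paths being 8-bit only, so bd == 8. Under that assumption, highbd_clip_pixel_add(dst, trans, bd) clamps to [0, 255], which is exactly clip_pixel(dst + trans), and the intermediate clamp range bd + 8 becomes the constant 16. A scalar sketch of the replacement:

    static uint8_t add_and_clip_8bit(uint8_t dst, int32_t trans) {
      // Equivalent to highbd_clip_pixel_add(dst, trans, /*bd=*/8) for 8-bit data.
      return clip_pixel((int)dst + (int)trans);
    }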
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index baad328..22d2977 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -13,12 +13,12 @@
 #include <arm_neon.h>
 #include <assert.h>
 
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
-#include "aom_ports/mem.h"
-#include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
@@ -31,94 +31,72 @@
   assert(w >= 2);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
-  uint8x8_t tmp0, tmp1;
-  uint8x16_t res_q;
-  uint16x8_t res, res_low, res_high;
-  const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
 
-  if (w >= 16) {
-    const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64);
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 16) {
-        __builtin_prefetch(src0);
-        __builtin_prefetch(src1);
-        const uint8x16_t tmp0_q = vld1q_u8(src0);
-        const uint8x16_t tmp1_q = vld1q_u8(src1);
-        const uint8x16_t m_q = vld1q_u8(mask);
-        const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q);
-        res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q));
-        res_low =
-            vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q));
-        res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q));
-        res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q),
-                            vget_high_u8(tmp1_q));
-        res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
-                            vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
-        vst1q_u8(dst, res_q);
-        src0 += 16;
-        src1 += 16;
-        dst += 16;
-        mask += 16;
-      }
-      src0 += src0_stride - w;
-      src1 += src1_stride - w;
-      dst += dst_stride - w;
-      mask -= w;
-    }
-  } else if (w == 8) {
-    const uint8x8_t m = vld1_u8(mask);
-    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
-    for (int i = 0; i < h; ++i) {
-      __builtin_prefetch(src0);
-      __builtin_prefetch(src1);
-      tmp0 = vld1_u8(src0);
-      tmp1 = vld1_u8(src1);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+  if (w > 8) {
+    do {
+      int i = 0;
+      do {
+        uint8x16_t m0 = vld1q_u8(mask + i);
+        uint8x16_t s0 = vld1q_u8(src0 + i);
+        uint8x16_t s1 = vld1q_u8(src1 + i);
+
+        uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+        vst1q_u8(dst + i, blend);
+
+        i += 16;
+      } while (i < w);
+
       src0 += src0_stride;
       src1 += src1_stride;
       dst += dst_stride;
-    }
+    } while (--h != 0);
+  } else if (w == 8) {
+    const uint8x8_t m0 = vld1_u8(mask);
+    do {
+      uint8x8_t s0 = vld1_u8(src0);
+      uint8x8_t s1 = vld1_u8(src1);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      vst1_u8(dst, blend);
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
   } else if (w == 4) {
-    assert(((uintptr_t)mask & 3) == 0);
-    const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
-    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
-    for (int i = 0; i < h; i += 2) {
-      __builtin_prefetch(src0 + 0 * src0_stride);
-      __builtin_prefetch(src0 + 1 * src0_stride);
-      __builtin_prefetch(src1 + 0 * src1_stride);
-      __builtin_prefetch(src1 + 1 * src1_stride);
-      tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
-      tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
-      store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
-      store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
-      src0 += (2 * src0_stride);
-      src1 += (2 * src1_stride);
-      dst += (2 * dst_stride);
-    }
-  } else if (w == 2) {
-    assert(((uintptr_t)mask & 1) == 0);
-    const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
-    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
-    for (int i = 0; i < h; i += 2) {
-      __builtin_prefetch(src0 + 0 * src0_stride);
-      __builtin_prefetch(src0 + 1 * src0_stride);
-      __builtin_prefetch(src1 + 0 * src1_stride);
-      __builtin_prefetch(src1 + 1 * src1_stride);
-      tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
-      tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
-      store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
-      store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
-      src0 += (2 * src0_stride);
-      src1 += (2 * src1_stride);
-      dst += (2 * dst_stride);
-    }
+    const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask);
+    do {
+      uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 16) {
+    const uint8x8_t m0 = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
+    do {
+      uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      store_unaligned_u8_2x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, w, h);
   }
 }
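Note: the rewritten kernels above delegate the per-pixel math to alpha_blend_a64_u8x8/u8x16 from aom_dsp/arm/blend_neon.h. A scalar sketch of that blend, assuming the usual A64 constants from aom_dsp/blend.h (AOM_BLEND_A64_MAX_ALPHA == 64, AOM_BLEND_A64_ROUND_BITS == 6):

    static uint8_t blend_a64_scalar(uint8_t m, uint8_t s0, uint8_t s1) {
      // dst = round((m * s0 + (64 - m) * s1) / 64)
      return (uint8_t)((m * s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * s1 +
                        (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                       AOM_BLEND_A64_ROUND_BITS);
    }

Shapes the NEON path no longer covers (here, w == 2 with h < 16) fall back to the generic aom_blend_a64_hmask_c implementation.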
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index c316977..d53d363 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -16,6 +16,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"
@@ -24,9 +25,6 @@
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, int w, int h) {
-  uint8x8_t tmp0, tmp1;
-  uint8x16_t tmp0_q, tmp1_q, res_q;
-  uint16x8_t res, res_low, res_high;
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
@@ -35,95 +33,80 @@
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  if (w >= 16) {
-    for (int i = 0; i < h; ++i) {
-      const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
-      const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
-      for (int j = 0; j < w; j += 16) {
-        __builtin_prefetch(src0);
-        __builtin_prefetch(src1);
-        tmp0_q = vld1q_u8(src0);
-        tmp1_q = vld1q_u8(src1);
-        res_low = vmull_u8(m, vget_low_u8(tmp0_q));
-        res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q));
-        res_high = vmull_u8(m, vget_high_u8(tmp0_q));
-        res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q));
-        res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
-                            vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
-        vst1q_u8(dst, res_q);
-        src0 += 16;
-        src1 += 16;
-        dst += 16;
-      }
-      src0 += src0_stride - w;
-      src1 += src1_stride - w;
-      dst += dst_stride - w;
-    }
-  } else if (w == 8) {
-    for (int i = 0; i < h; ++i) {
-      __builtin_prefetch(src0);
-      __builtin_prefetch(src1);
-      const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
-      const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
-      tmp0 = vld1_u8(src0);
-      tmp1 = vld1_u8(src1);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+  if (w > 8) {
+    do {
+      uint8x16_t m0 = vdupq_n_u8(mask[0]);
+      int i = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + i);
+        uint8x16_t s1 = vld1q_u8(src1 + i);
+
+        uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+        vst1q_u8(dst + i, blend);
+
+        i += 16;
+      } while (i < w);
+
+      mask += 1;
       src0 += src0_stride;
       src1 += src1_stride;
       dst += dst_stride;
-    }
+    } while (--h != 0);
+  } else if (w == 8) {
+    do {
+      uint8x8_t m0 = vdup_n_u8(mask[0]);
+      uint8x8_t s0 = vld1_u8(src0);
+      uint8x8_t s1 = vld1_u8(src1);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      vst1_u8(dst, blend);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
   } else if (w == 4) {
-    for (int i = 0; i < h; i += 2) {
-      __builtin_prefetch(src0 + 0 * src0_stride);
-      __builtin_prefetch(src0 + 1 * src0_stride);
-      __builtin_prefetch(src1 + 0 * src1_stride);
-      __builtin_prefetch(src1 + 1 * src1_stride);
-      const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]);
-      const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]);
-      const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2));
-      const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]);
-      const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
-      const uint8x8_t max_minus_m =
-          vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
-      tmp0 = load_unaligned_u8_4x2(src0, src0_stride);
-      tmp1 = load_unaligned_u8_4x2(src1, src1_stride);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
-      store_unaligned_u8_4x1(dst + 0 * dst_stride, result, 0);
-      store_unaligned_u8_4x1(dst + 1 * dst_stride, result, 1);
-      src0 += (2 * src0_stride);
-      src1 += (2 * src1_stride);
-      dst += (2 * dst_stride);
-    }
-  } else if (w == 2) {
-    for (int i = 0; i < h; i += 2) {
-      __builtin_prefetch(src0 + 0 * src0_stride);
-      __builtin_prefetch(src0 + 1 * src0_stride);
-      __builtin_prefetch(src1 + 0 * src1_stride);
-      __builtin_prefetch(src1 + 1 * src1_stride);
-      const uint8x8_t m1 = vdup_n_u8(mask[i]);
-      const uint8x8_t m2 = vdup_n_u8(mask[i + 1]);
-      const uint16x4x2_t m_trn =
-          vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2));
-      const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]);
-      const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]);
-      const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]);
-      const uint16x4x2_t max_minus_m_trn = vtrn_u16(
-          vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
-      const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
-      tmp0 = load_unaligned_u8_2x2(src0, src0_stride);
-      tmp1 = load_unaligned_u8_2x2(src1, src1_stride);
-      res = vmull_u8(m, tmp0);
-      res = vmlal_u8(res, max_minus_m, tmp1);
-      const uint8x8_t result = vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS);
-      store_unaligned_u8_2x1(dst + 0 * dst_stride, result, 0);
-      store_unaligned_u8_2x1(dst + 1 * dst_stride, result, 1);
-      src0 += (2 * src0_stride);
-      src1 += (2 * src1_stride);
-      dst += (2 * dst_stride);
-    }
+    do {
+      const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]);
+      const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]);
+      const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1));
+      uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_unaligned_u8_4x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 16) {
+    do {
+      uint16x4_t m0 = vdup_n_u16(0);
+      m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+      uint8x8_t m =
+          vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+      uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_unaligned_u8_2x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, w, h);
   }
 }
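Note: the vmask kernel differs from the hmask one only in mask indexing: one alpha value per row instead of per column, which is why the mask pointer now advances with the rows. A scalar sketch of the inner loop (the blend itself is the same A64 formula as in the previous sketch):

    for (int i = 0; i < h; ++i) {
      const int m = mask[i];  // row-constant alpha
      for (int j = 0; j < w; ++j) {
        dst[i * dst_stride + j] =
            (uint8_t)((m * src0[i * src0_stride + j] +
                       (64 - m) * src1[i * src1_stride + j] + 32) >> 6);
      }
    }

As in the hmask file, the remaining narrow shapes (w == 2 with h < 16) are handed to aom_blend_a64_vmask_c.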
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 4397a47..1bcf6a1 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -9,23 +9,408 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_neon
-#include "av1/common/cdef_block_simd.h"
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/cdef_block.h"
 
 void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint8_t *src, int sstride,
                                         int width, int height) {
-  int j;
-  for (int i = 0; i < height; i++) {
-    for (j = 0; j < (width & ~0x7); j += 8) {
-      v64 row = v64_load_unaligned(&src[i * sstride + j]);
-      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+  do {
+    const uint8_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+
+    int w = 0;
+    while (w <= width - 16) {
+      uint8x16_t row = vld1q_u8(src_ptr + w);
+      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
+      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
+
+      w += 16;
     }
-    for (; j < width; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
+    if (width - w == 8) {
+      uint8x8_t row = vld1_u8(src_ptr + w);
+      vst1q_u16(dst_ptr + w, vmovl_u8(row));
+    } else if (width - w == 4) {
+      for (int i = 0; i < 4; i++) {
+        dst_ptr[i] = src_ptr[i];
+      }
     }
+
+    src += sstride;
+    dst += dstride;
+  } while (--height != 0);
+}
+
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
+                                         const uint16_t *src, int sstride,
+                                         int width, int height) {
+  do {
+    const uint16_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+
+    int w = 0;
+    while (width - w >= 8) {
+      uint16x8_t row = vld1q_u16(src_ptr + w);
+      vst1q_u16(dst_ptr + w, row);
+
+      w += 8;
+    }
+    if (width == 4) {
+      uint16x4_t row = vld1_u16(src_ptr);
+      vst1_u16(dst_ptr, row);
+    }
+
+    src += sstride;
+    dst += dstride;
+  } while (--height != 0);
+}
+
+static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) {
+  return vreinterpretq_s16_s64(vcombine_s64(vcreate_s64(a), vcreate_s64(b)));
+}
+
+#define SHL_HIGH_NEON(n)                                                       \
+  static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) {              \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                                \
+    return v128_from_64_neon(                                                  \
+        0, vget_lane_u64(vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \
+                                    (n - 8) * 8),                              \
+                         0));                                                  \
   }
+
+#define SHL_NEON(n)                                                      \
+  static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) {        \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                          \
+    return v128_from_64_neon(                                            \
+        0, vget_lane_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), 0)); \
+  }
+
+#define SHL_LOW_NEON(n)                                                        \
+  static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) {              \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                                \
+    return v128_from_64_neon(                                                  \
+        vget_lane_u64(                                                         \
+            vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), 0),  \
+        vget_lane_u64(                                                         \
+            vorr_u64(                                                          \
+                vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \
+                vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)),          \
+                           (8 - n) * 8)),                                      \
+            0));                                                               \
+  }
+
+SHL_HIGH_NEON(14)
+SHL_HIGH_NEON(12)
+SHL_HIGH_NEON(10)
+SHL_NEON(8)
+SHL_LOW_NEON(6)
+SHL_LOW_NEON(4)
+SHL_LOW_NEON(2)
+
+#define v128_shl_n_byte_neon(a, n) v128_shl_##n##_byte_neon(a)
+
+#define SHR_HIGH_NEON(n)                                                     \
+  static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) {            \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                              \
+    return v128_from_64_neon(                                                \
+        vget_lane_u64(vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \
+                                 (n - 8) * 8),                               \
+                      0),                                                    \
+        0);                                                                  \
+  }
+
+#define SHR_NEON(n)                                                       \
+  static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) {         \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                           \
+    return v128_from_64_neon(                                             \
+        vget_lane_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), 0), 0); \
+  }
+
+#define SHR_LOW_NEON(n)                                                       \
+  static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) {             \
+    int64x2_t a_s64 = vreinterpretq_s64_s16(a);                               \
+    return v128_from_64_neon(                                                 \
+        vget_lane_u64(                                                        \
+            vorr_u64(                                                         \
+                vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), \
+                vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)),        \
+                           (8 - n) * 8)),                                     \
+            0),                                                               \
+        vget_lane_u64(                                                        \
+            vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8),    \
+            0));                                                              \
+  }
+
+SHR_HIGH_NEON(14)
+SHR_HIGH_NEON(12)
+SHR_HIGH_NEON(10)
+SHR_NEON(8)
+SHR_LOW_NEON(6)
+SHR_LOW_NEON(4)
+SHR_LOW_NEON(2)
+
+#define v128_shr_n_byte_neon(a, n) v128_shr_##n##_byte_neon(a)
+
+static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) {
+  uint32x4_t t1 =
+      vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b)));
+  uint32x4_t t2 =
+      vreinterpretq_u32_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b)));
+#if AOM_ARCH_AARCH64
+  return vpaddq_u32(t1, t2);
+#else
+  return vcombine_u32(vpadd_u32(vget_low_u32(t1), vget_high_u32(t1)),
+                      vpadd_u32(vget_low_u32(t2), vget_high_u32(t2)));
+#endif
+}
+
+// partial A is a 16-bit vector of the form:
+// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+// [0  y1 y2 y3 y4 y5 y6 y7].
+// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+// and const2.
+static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
+                                               int16x8_t partialb,
+                                               uint32x4_t const1,
+                                               uint32x4_t const2) {
+  int16x8_t tmp;
+  // Reverse partial B.
+  uint8x16_t pattern = vreinterpretq_u8_u64(
+      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
+                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));
+
+#if AOM_ARCH_AARCH64
+  partialb =
+      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
+#else
+  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
+                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
+  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
+  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
+  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
+#endif
+
+  // Interleave the x and y values of identical indices and pair x8 with 0.
+  tmp = partiala;
+  partiala = vzipq_s16(partiala, partialb).val[0];
+  partialb = vzipq_s16(tmp, partialb).val[1];
+  // Square and add the corresponding x and y values.
+  uint32x4_t partiala_u32 = v128_madd_s16_neon(partiala, partiala);
+  uint32x4_t partialb_u32 = v128_madd_s16_neon(partialb, partialb);
+
+  // Multiply by constant.
+  partiala_u32 = vmulq_u32(partiala_u32, const1);
+  partialb_u32 = vmulq_u32(partialb_u32, const2);
+
+  // Sum all results.
+  partiala_u32 = vaddq_u32(partiala_u32, partialb_u32);
+  return partiala_u32;
+}
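Note: fold_mul_and_sum_neon keeps its result split across four 32-bit lanes until hsum4_neon collapses them; mathematically it accumulates the weighted sum described in the comment above. A scalar sketch under that reading (names and array layout illustrative):

    static uint32_t fold_mul_and_sum_scalar(const int16_t x[8], const int16_t y[7],
                                            const uint32_t c[8]) {
      uint32_t sum = 0;
      for (int k = 0; k < 8; ++k) {
        const int32_t xk = x[k];
        const int32_t yk = (k < 7) ? y[k] : 0;  // x8 pairs with 0
        sum += (uint32_t)(xk * xk + yk * yk) * c[k];
      }
      return sum;
    }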
+
+static INLINE uint64x2_t ziplo_u64(uint32x4_t a, uint32x4_t b) {
+  return vcombine_u64(vget_low_u64(vreinterpretq_u64_u32(a)),
+                      vget_low_u64(vreinterpretq_u64_u32(b)));
+}
+
+static INLINE uint64x2_t ziphi_u64(uint32x4_t a, uint32x4_t b) {
+  return vcombine_u64(vget_high_u64(vreinterpretq_u64_u32(a)),
+                      vget_high_u64(vreinterpretq_u64_u32(b)));
+}
+
+static INLINE uint32x4_t hsum4_neon(uint32x4_t x0, uint32x4_t x1, uint32x4_t x2,
+                                    uint32x4_t x3) {
+  uint32x4_t t0, t1, t2, t3;
+  t0 = vzipq_u32(x0, x1).val[0];
+  t1 = vzipq_u32(x2, x3).val[0];
+  t2 = vzipq_u32(x0, x1).val[1];
+  t3 = vzipq_u32(x2, x3).val[1];
+  x0 = vreinterpretq_u32_u64(ziplo_u64(t0, t1));
+  x1 = vreinterpretq_u32_u64(ziphi_u64(t0, t1));
+  x2 = vreinterpretq_u32_u64(ziplo_u64(t2, t3));
+  x3 = vreinterpretq_u32_u64(ziphi_u64(t2, t3));
+  return vaddq_u32(vaddq_u32(x0, x1), vaddq_u32(x2, x3));
+}
+
+static INLINE uint32x4_t compute_directions_neon(int16x8_t lines[8],
+                                                 uint32_t cost[4]) {
+  int16x8_t partial4a, partial4b, partial5a, partial5b, partial6, partial7a,
+      partial7b;
+  int16x8_t tmp;
+
+  // Partial sums for lines 0 and 1.
+  partial4a = v128_shl_n_byte_neon(lines[0], 14);
+  partial4b = v128_shr_n_byte_neon(lines[0], 2);
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[1], 12));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[1], 4));
+  tmp = vaddq_s16(lines[0], lines[1]);
+  partial5a = v128_shl_n_byte_neon(tmp, 10);
+  partial5b = v128_shr_n_byte_neon(tmp, 6);
+  partial7a = v128_shl_n_byte_neon(tmp, 4);
+  partial7b = v128_shr_n_byte_neon(tmp, 12);
+  partial6 = tmp;
+
+  // Partial sums for lines 2 and 3.
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[2], 10));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[2], 6));
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[3], 8));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[3], 8));
+  tmp = vaddq_s16(lines[2], lines[3]);
+  partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 8));
+  partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 8));
+  partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 6));
+  partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 10));
+  partial6 = vaddq_s16(partial6, tmp);
+
+  // Partial sums for lines 4 and 5.
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[4], 6));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[4], 10));
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[5], 4));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[5], 12));
+  tmp = vaddq_s16(lines[4], lines[5]);
+  partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 6));
+  partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 10));
+  partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 8));
+  partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 8));
+  partial6 = vaddq_s16(partial6, tmp);
+
+  // Partial sums for lines 6 and 7.
+  partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[6], 2));
+  partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[6], 14));
+  partial4a = vaddq_s16(partial4a, lines[7]);
+  tmp = vaddq_s16(lines[6], lines[7]);
+  partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 4));
+  partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 12));
+  partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 10));
+  partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 6));
+  partial6 = vaddq_s16(partial6, tmp);
+
+  uint32x4_t const0 = vreinterpretq_u32_u64(
+      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
+                   vcreate_u64((uint64_t)210 << 32 | 280)));
+  uint32x4_t const1 = vreinterpretq_u32_u64(
+      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
+                   vcreate_u64((uint64_t)105 << 32 | 120)));
+  uint32x4_t const2 = vreinterpretq_u32_u64(
+      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
+  uint32x4_t const3 = vreinterpretq_u32_u64(
+      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
+                   vcreate_u64((uint64_t)105 << 32 | 105)));
+
+  // Compute costs in terms of partial sums.
+  uint32x4_t partial4a_u32 =
+      fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
+  uint32x4_t partial7a_u32 =
+      fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);
+  uint32x4_t partial5a_u32 =
+      fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
+  uint32x4_t partial6_u32 = v128_madd_s16_neon(partial6, partial6);
+  partial6_u32 = vmulq_u32(partial6_u32, vdupq_n_u32(105));
+
+  partial4a_u32 =
+      hsum4_neon(partial4a_u32, partial5a_u32, partial6_u32, partial7a_u32);
+  vst1q_u32(cost, partial4a_u32);
+  return partial4a_u32;
+}
+
+static INLINE int64x2_t ziplo_s64(int32x4_t a, int32x4_t b) {
+  return vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(a)),
+                      vget_low_s64(vreinterpretq_s64_s32(b)));
+}
+
+static INLINE int64x2_t ziphi_s64(int32x4_t a, int32x4_t b) {
+  return vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(a)),
+                      vget_high_s64(vreinterpretq_s64_s32(b)));
+}
+
+// Transpose and reverse the order of the lines -- equivalent to a 90-degree
+// counter-clockwise rotation of the pixels.
+static INLINE void array_reverse_transpose_8x8_neon(int16x8_t *in,
+                                                    int16x8_t *res) {
+  const int32x4_t tr0_0 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[0]);
+  const int32x4_t tr0_1 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[0]);
+  const int32x4_t tr0_2 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[1]);
+  const int32x4_t tr0_3 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[1]);
+  const int32x4_t tr0_4 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[0]);
+  const int32x4_t tr0_5 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[0]);
+  const int32x4_t tr0_6 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[1]);
+  const int32x4_t tr0_7 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[1]);
+
+  const int32x4_t tr1_0 = vzipq_s32(tr0_0, tr0_1).val[0];
+  const int32x4_t tr1_1 = vzipq_s32(tr0_4, tr0_5).val[0];
+  const int32x4_t tr1_2 = vzipq_s32(tr0_0, tr0_1).val[1];
+  const int32x4_t tr1_3 = vzipq_s32(tr0_4, tr0_5).val[1];
+  const int32x4_t tr1_4 = vzipq_s32(tr0_2, tr0_3).val[0];
+  const int32x4_t tr1_5 = vzipq_s32(tr0_6, tr0_7).val[0];
+  const int32x4_t tr1_6 = vzipq_s32(tr0_2, tr0_3).val[1];
+  const int32x4_t tr1_7 = vzipq_s32(tr0_6, tr0_7).val[1];
+
+  res[7] = vreinterpretq_s16_s64(ziplo_s64(tr1_0, tr1_1));
+  res[6] = vreinterpretq_s16_s64(ziphi_s64(tr1_0, tr1_1));
+  res[5] = vreinterpretq_s16_s64(ziplo_s64(tr1_2, tr1_3));
+  res[4] = vreinterpretq_s16_s64(ziphi_s64(tr1_2, tr1_3));
+  res[3] = vreinterpretq_s16_s64(ziplo_s64(tr1_4, tr1_5));
+  res[2] = vreinterpretq_s16_s64(ziphi_s64(tr1_4, tr1_5));
+  res[1] = vreinterpretq_s16_s64(ziplo_s64(tr1_6, tr1_7));
+  res[0] = vreinterpretq_s16_s64(ziphi_s64(tr1_6, tr1_7));
+}
+
+static INLINE uint32_t compute_best_dir(uint8x16_t a) {
+  uint8x16_t idx =
+      vandq_u8(a, vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+#if AOM_ARCH_AARCH64
+  return vaddv_u8(vget_low_u8(idx)) + (vaddv_u8(vget_high_u8(idx)) << 8);
+#else
+  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(idx)));
+  uint8x16_t s = vreinterpretq_u8_u64(m);
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vzip_u8(vget_low_u8(s), vget_high_u8(s)).val[0]), 0);
+#endif
+}
+
+int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
+                       int coeff_shift) {
+  uint32_t cost[8];
+  uint32_t best_cost = 0;
+  int best_dir = 0;
+  int16x8_t lines[8];
+  for (int i = 0; i < 8; i++) {
+    uint16x8_t s = vld1q_u16(&img[i * stride]);
+    lines[i] = vreinterpretq_s16_u16(
+        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
+  }
+
+  // Compute "mostly vertical" directions.
+  uint32x4_t cost47 = compute_directions_neon(lines, cost + 4);
+
+  array_reverse_transpose_8x8_neon(lines, lines);
+
+  // Compute "mostly horizontal" directions.
+  uint32x4_t cost03 = compute_directions_neon(lines, cost);
+
+  uint32x4_t max_cost = vmaxq_u32(cost03, cost47);
+  max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 2));
+  max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 1));
+  best_cost = vgetq_lane_u32(max_cost, 0);
+  uint16x8_t idx = vcombine_u16(vqmovn_u32(vceqq_u32(max_cost, cost03)),
+                                vqmovn_u32(vceqq_u32(max_cost, cost47)));
+  uint8x16_t idx_u8 = vcombine_u8(vqmovn_u16(idx), vqmovn_u16(idx));
+  best_dir = compute_best_dir(idx_u8);
+  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
+
+  // Difference between the optimal variance and the variance along the
+  // orthogonal direction. Again, the sum(x^2) terms cancel out.
+  *var = best_cost - cost[(best_dir + 4) & 7];
+  // We'd normally divide by 840, but dividing by 1024 is close enough
+  // for what we're going to do with this.
+  *var >>= 10;
+  return best_dir;
 }
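Note: compute_best_dir() returns a bitmask of the directions whose cost equals the maximum, and get_msb(best_dir ^ (best_dir - 1)) then extracts the index of its lowest set bit, i.e. counts trailing zeros. A sketch of that trick in isolation (assumes a nonzero mask and get_msb() from aom_ports/bitops.h):

    static int lowest_set_bit_index(uint32_t x) {
      // x ^ (x - 1) keeps the lowest set bit plus everything below it,
      // so its MSB position equals the trailing-zero count of x.
      // e.g. x = 0b101000 -> x ^ (x - 1) = 0b001111 -> get_msb() = 3.
      return get_msb(x ^ (x - 1));
    }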
 
 void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
@@ -38,3 +423,532 @@
   // Process second 8x8.
   *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
 }
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
+                                    unsigned int threshold, int adjdamp) {
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, b));
+  const int16x8_t sign = vshrq_n_s16(diff, 15);
+  diff = vabsq_s16(diff);
+  const uint16x8_t s =
+      vqsubq_u16(vdupq_n_u16(threshold),
+                 vreinterpretq_u16_s16(vshlq_s16(diff, vdupq_n_s16(-adjdamp))));
+  return veorq_s16(vaddq_s16(sign, vminq_s16(diff, vreinterpretq_s16_u16(s))),
+                   sign);
+}
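Note: constrain16 vectorizes the formula in the comment above; the sign is recovered with an arithmetic shift and applied through the (v + s) ^ s two's-complement identity (s is 0 or -1). A scalar sketch of the same constraint:

    static int constrain_scalar(int a, int b, int threshold, int adjdamp) {
      const int diff = a - b;
      const int absdiff = diff < 0 ? -diff : diff;
      // The clamp strength decays as the pixel difference grows.
      const int limit = AOMMAX(0, threshold - (absdiff >> adjdamp));
      const int clamp = AOMMIN(absdiff, limit);
      return diff < 0 ? -clamp : clamp;
    }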
+
+static INLINE uint16x8_t get_max_primary(const int is_lowbd, uint16x8_t *tap,
+                                         uint16x8_t max,
+                                         uint16x8_t cdef_large_value_mask) {
+  if (is_lowbd) {
+    uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]);
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3]));
+    /* The source is 16 bits, however, we only really care about the lower
+    8 bits.  The upper 8 bits contain the "large" flag.  After the final
+    primary max has been calculated, zero out the upper 8 bits.  Use this
+    to find the "16 bit" max. */
+    max = vmaxq_u16(
+        max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask));
+  } else {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask));
+  }
+  return max;
+}
+
+static INLINE uint16x8_t get_max_secondary(const int is_lowbd, uint16x8_t *tap,
+                                           uint16x8_t max,
+                                           uint16x8_t cdef_large_value_mask) {
+  if (is_lowbd) {
+    uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]);
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[4]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[5]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[6]));
+    max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[7]));
+    /* The source is 16 bits, however, we only really care about the lower
+    8 bits.  The upper 8 bits contain the "large" flag.  After the final
+    secondary max has been calculated, zero out the upper 8 bits.  Use this
+    to find the "16 bit" max. */
+    max = vmaxq_u16(
+        max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask));
+  } else {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[4], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[5], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[6], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(tap[7], cdef_large_value_mask));
+  }
+  return max;
+}
+
+static INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
+                                    const uint16_t *in, int pri_strength,
+                                    int sec_strength, int dir, int pri_damping,
+                                    int sec_damping, int coeff_shift,
+                                    int height, int enable_primary,
+                                    int enable_secondary) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  const int clipping_required = enable_primary && enable_secondary;
+  uint16x8_t max, min;
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (enable_primary && pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+  if (enable_secondary && sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  int h = height;
+  do {
+    int16x8_t sum = vdupq_n_s16(0);
+    uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+    max = min = s;
+
+    if (enable_primary) {
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+      tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+      int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping);
+      int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping);
+
+      // sum += pri_taps[0] * (p0 + p1)
+      p0 = vaddq_s16(p0, p1);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0]));
+
+      // Primary far taps
+      tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+      tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+      p0 = constrain16(tap[2], s, pri_strength, pri_damping);
+      p1 = constrain16(tap[3], s, pri_strength, pri_damping);
+
+      // sum += pri_taps[1] * (p0 + p1)
+      p0 = vaddq_s16(p0, p1);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1]));
+
+      if (clipping_required) {
+        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = vminq_u16(min, tap[0]);
+        min = vminq_u16(min, tap[1]);
+        min = vminq_u16(min, tap[2]);
+        min = vminq_u16(min, tap[3]);
+      }
+    }
+
+    if (enable_secondary) {
+      uint16x8_t tap[8];
+
+      // Secondary near taps
+      tap[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+      tap[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+      tap[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+      tap[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+      int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping);
+      int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping);
+      int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping);
+      int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping);
+
+      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+      p0 = vaddq_s16(p0, p1);
+      p2 = vaddq_s16(p2, p3);
+      p0 = vaddq_s16(p0, p2);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0]));
+
+      // Secondary far taps
+      tap[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+      tap[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+      tap[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+      tap[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+      p0 = constrain16(tap[4], s, sec_strength, sec_damping);
+      p1 = constrain16(tap[5], s, sec_strength, sec_damping);
+      p2 = constrain16(tap[6], s, sec_strength, sec_damping);
+      p3 = constrain16(tap[7], s, sec_strength, sec_damping);
+
+      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+      p0 = vaddq_s16(p0, p1);
+      p2 = vaddq_s16(p2, p3);
+      p0 = vaddq_s16(p0, p2);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1]));
+
+      if (clipping_required) {
+        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = vminq_u16(min, tap[0]);
+        min = vminq_u16(min, tap[1]);
+        min = vminq_u16(min, tap[2]);
+        min = vminq_u16(min, tap[3]);
+        min = vminq_u16(min, tap[4]);
+        min = vminq_u16(min, tap[5]);
+        min = vminq_u16(min, tap[6]);
+        min = vminq_u16(min, tap[7]);
+      }
+    }
+
+    // res = row + ((sum - (sum < 0) + 8) >> 4)
+    sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+    int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8));
+    res = vshrq_n_s16(res, 4);
+    res = vaddq_s16(vreinterpretq_s16_u16(s), res);
+
+    if (clipping_required) {
+      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
+                      vreinterpretq_s16_u16(max));
+    }
+
+    if (is_lowbd) {
+      const uint8x8_t res_128 = vqmovun_s16(res);
+      store_unaligned_u8_4x2(dst8, dstride, res_128);
+    } else {
+      store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res));
+    }
+
+    in += 2 * CDEF_BSTRIDE;
+    dst8 += 2 * dstride;
+    dst16 += 2 * dstride;
+    h -= 2;
+  } while (h != 0);
+}
+
+static INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
+                                    const uint16_t *in, int pri_strength,
+                                    int sec_strength, int dir, int pri_damping,
+                                    int sec_damping, int coeff_shift,
+                                    int height, int enable_primary,
+                                    int enable_secondary) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  const int clipping_required = enable_primary && enable_secondary;
+  uint16x8_t max, min;
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (enable_primary && pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+  if (enable_secondary && sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  int h = height;
+  do {
+    int16x8_t sum = vdupq_n_s16(0);
+    uint16x8_t s = vld1q_u16(in);
+    max = min = s;
+
+    if (enable_primary) {
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = vld1q_u16(in + po1);
+      tap[1] = vld1q_u16(in - po1);
+      int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping);
+      int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping);
+
+      // sum += pri_taps[0] * (p0 + p1)
+      p0 = vaddq_s16(p0, p1);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0]));
+
+      // Primary far taps
+      tap[2] = vld1q_u16(in + po2);
+      p0 = constrain16(tap[2], s, pri_strength, pri_damping);
+      tap[3] = vld1q_u16(in - po2);
+      p1 = constrain16(tap[3], s, pri_strength, pri_damping);
+
+      // sum += pri_taps[1] * (p0 + p1)
+      p0 = vaddq_s16(p0, p1);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1]));
+      if (clipping_required) {
+        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = vminq_u16(min, tap[0]);
+        min = vminq_u16(min, tap[1]);
+        min = vminq_u16(min, tap[2]);
+        min = vminq_u16(min, tap[3]);
+      }
+    }
+
+    if (enable_secondary) {
+      uint16x8_t tap[8];
+
+      // Secondary near taps
+      tap[0] = vld1q_u16(in + s1o1);
+      tap[1] = vld1q_u16(in - s1o1);
+      tap[2] = vld1q_u16(in + s2o1);
+      tap[3] = vld1q_u16(in - s2o1);
+      int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping);
+      int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping);
+      int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping);
+      int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping);
+
+      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+      p0 = vaddq_s16(p0, p1);
+      p2 = vaddq_s16(p2, p3);
+      p0 = vaddq_s16(p0, p2);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0]));
+
+      // Secondary far taps
+      tap[4] = vld1q_u16(in + s1o2);
+      tap[5] = vld1q_u16(in - s1o2);
+      tap[6] = vld1q_u16(in + s2o2);
+      tap[7] = vld1q_u16(in - s2o2);
+      p0 = constrain16(tap[4], s, sec_strength, sec_damping);
+      p1 = constrain16(tap[5], s, sec_strength, sec_damping);
+      p2 = constrain16(tap[6], s, sec_strength, sec_damping);
+      p3 = constrain16(tap[7], s, sec_strength, sec_damping);
+
+      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+      p0 = vaddq_s16(p0, p1);
+      p2 = vaddq_s16(p2, p3);
+      p0 = vaddq_s16(p0, p2);
+      sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1]));
+
+      if (clipping_required) {
+        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = vminq_u16(min, tap[0]);
+        min = vminq_u16(min, tap[1]);
+        min = vminq_u16(min, tap[2]);
+        min = vminq_u16(min, tap[3]);
+        min = vminq_u16(min, tap[4]);
+        min = vminq_u16(min, tap[5]);
+        min = vminq_u16(min, tap[6]);
+        min = vminq_u16(min, tap[7]);
+      }
+    }
+
+    // res = row + ((sum - (sum < 0) + 8) >> 4)
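+    // Note: vcltq_s16() produces all-ones lanes (-1 when reinterpreted as
+    // s16) where sum < 0, so the add below implements the "- (sum < 0)" term.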
+    sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+    int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8));
+    res = vshrq_n_s16(res, 4);
+    res = vaddq_s16(vreinterpretq_s16_u16(s), res);
+    if (clipping_required) {
+      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
+                      vreinterpretq_s16_u16(max));
+    }
+
+    if (is_lowbd) {
+      const uint8x8_t res_128 = vqmovun_s16(res);
+      vst1_u8(dst8, res_128);
+    } else {
+      vst1q_u16(dst16, vreinterpretq_u16_s16(res));
+    }
+
+    in += CDEF_BSTRIDE;
+    dst8 += dstride;
+    dst16 += dstride;
+  } while (--h != 0);
+}
+
+static INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride,
+                                  const uint16_t *in, int height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+
+  int h = height;
+  do {
+    const uint16x8_t row = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+    if (is_lowbd) {
+      const uint8x8_t res_128 = vqmovn_u16(row);
+      store_unaligned_u8_4x2(dst8, dstride, res_128);
+    } else {
+      store_unaligned_u16_4x2(dst16, dstride, row);
+    }
+
+    in += 2 * CDEF_BSTRIDE;
+    dst8 += 2 * dstride;
+    dst16 += 2 * dstride;
+    h -= 2;
+  } while (h != 0);
+}
+
+static INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride,
+                                  const uint16_t *in, int height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+
+  int h = height;
+  do {
+    const uint16x8_t row = vld1q_u16(in);
+    if (is_lowbd) {
+      const uint8x8_t res_128 = vqmovn_u16(row);
+      vst1_u8(dst8, res_128);
+    } else {
+      vst1q_u16(dst16, row);
+    }
+
+    in += CDEF_BSTRIDE;
+    dst8 += dstride;
+    dst16 += dstride;
+  } while (--h != 0);
+}
+
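+// The cdef_filter_{8,16}_{0,1,2,3}_neon wrappers below select a filtering
+// mode by suffix: _0 applies primary and secondary filtering, _1 primary
+// only, _2 secondary only, and _3 neither (a plain copy). The 8/16 infix
+// selects the low or high bit-depth output path.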
+void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  }
+}
+
+void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+  (void)block_width;
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+  }
+}
+
+void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  }
+}
+
+void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+  (void)block_width;
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+  }
+}
diff --git a/av1/common/arm/compound_convolve_neon.c b/av1/common/arm/compound_convolve_neon.c
new file mode 100644
index 0000000..2e6af68
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon.c
@@ -0,0 +1,2731 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t s2, const int16x4_t s3,
+                                         const int16x4_t x_filter,
+                                         const int16x4_t horiz_const) {
+  int16x4_t sum = horiz_const;
+  sum = vmla_lane_s16(sum, s0, x_filter, 0);
+  sum = vmla_lane_s16(sum, s1, x_filter, 1);
+  sum = vmla_lane_s16(sum, s2, x_filter, 2);
+  sum = vmla_lane_s16(sum, s3, x_filter, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
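+  // Halving the (even) taps halves the accumulated sum exactly, so shifting
+  // right by ROUND0_BITS - 1 instead of ROUND0_BITS gives the same result
+  // while keeping the intermediate sum within int16_t range.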
+  return vshr_n_s16(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         const int16x8_t x_filter,
+                                         const int16x8_t horiz_const) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int16x8_t sum = horiz_const;
+  sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
+  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
+  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
+  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
+  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
+  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
+  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
+  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshrq_n_s16(sum, ROUND0_BITS - 1);
+}
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    const int16_t *x_filter_ptr, const int im_h, int w) {
+  const int bd = 8;
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int height = im_h;
+
+  if (w == 4) {
+    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    // (The extra -1 is needed because we halved the filter values.)
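+    // With the 8-bit configuration assumed here (bd == 8, FILTER_BITS == 7,
+    // ROUND0_BITS == 3), horiz_const evaluates to (1 << 13) + (1 << 1).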
+    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
+                                             (1 << ((ROUND0_BITS - 1) - 1)));
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
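+    // The caller centres src_ptr using filter_params_x->taps / 2 - 1; only
+    // kernel indices 2..5 are applied here, so step forward 2 samples to
+    // line tap index 2 up with the correct source pixel.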
+    src_ptr += 2;
+
+    do {
+      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      __builtin_prefetch(dst_ptr);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+
+      int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
+
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    // (The extra -1 is needed because we halved the filter values.)
+    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
+                                              (1 << ((ROUND0_BITS - 1) - 1)));
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+    do {
+      const uint8_t *s;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s = src_ptr + 7;
+
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        x_filter, horiz_const);
+        int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+                                        x_filter, horiz_const);
+        int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+                                        x_filter, horiz_const);
+        int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+                                        x_filter, horiz_const);
+        int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+                                        x_filter, horiz_const);
+        int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+                                        x_filter, horiz_const);
+        int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+                                        x_filter, horiz_const);
+        int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+                                        x_filter, horiz_const);
+
+        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      height -= 8;
+    } while (height > 8);
+#endif  // AOM_ARCH_AARCH64
+
+    do {
+      const uint8_t *s;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      uint8x8_t t0 = vld1_u8(src_ptr);
+      int16x8_t s0 =
+          vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      s = src_ptr + 8;
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        x_filter, horiz_const);
+        vst1q_s16(d, d0);
+
+        s0 = s8;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
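+// Compound (dist_wtd) 2D convolution: a horizontal pass writes an int16_t
+// intermediate block (im_block), then a 6-tap or 8-tap vertical pass
+// produces the CONV_BUF result, optionally averaged with the other
+// prediction using either equal weights or the fwd/bck distance weights.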
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+                                   uint8_t *dst8, int dst8_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
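+  // Vertical kernels shorter than 6 taps reuse the 6-tap path; their unused
+  // outer taps are zero, so the result is unchanged.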
+
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+  dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+                                  x_filter_ptr, im_h, w);
+
+  if (clamped_y_taps == 6) {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
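+  // The copy path scales pixels by 1 << (FILTER_BITS - ROUND0_BITS) and adds
+  // round_offset so that they match the precision of filtered compound
+  // predictions. With the 8-bit configuration assumed here (FILTER_BITS == 7,
+  // ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7), round_offset is
+  // (1 << 12) + (1 << 11).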
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    do {
+      uint8x8_t s0, s1, s2, s3;
+      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+      uint16x4_t d1 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+      uint16x4_t d2 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+      uint16x4_t d3 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01, d23;
+      compute_dist_wtd_avg_4x4(
+          dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
+          vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
+
+      store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
+      store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
+      store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
+      store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      dst8 += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      CONV_BUF_TYPE *d = dst;
+      uint8_t *d_u8 = dst8;
+      int width = w;
+
+      do {
+        uint8x8_t s0, s1, s2, s3;
+        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset,
+                                 vreinterpretq_s16_u16(round_offset_vec),
+                                 &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      dst8 += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_avg_neon(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    do {
+      uint8x8_t s0, s1, s2, s3;
+      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+      uint16x4_t d1 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+      uint16x4_t d2 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+      uint16x4_t d3 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01, d23;
+      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                            vreinterpretq_s16_u16(round_offset_vec), &d01,
+                            &d23);
+
+      store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
+      store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
+      store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
+      store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      dst8 += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      CONV_BUF_TYPE *d = dst;
+      uint8_t *d_u8 = dst8;
+      int width = w;
+
+      do {
+        uint8x8_t s0, s1, s2, s3;
+        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
+                              &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      dst8 += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
+                                                  int src_stride, int w, int h,
+                                                  ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    do {
+      uint8x8_t s0, s1, s2, s3;
+      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+      uint16x4_t d1 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+      uint16x4_t d2 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+      uint16x4_t d3 =
+          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      CONV_BUF_TYPE *d = dst;
+      int width = w;
+
+      do {
+        uint8x8_t s0, s1, s2, s3;
+        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
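+// Top-level copy dispatcher: with do_average == 0 (first prediction of the
+// compound pair) the scaled pixels are only written to the CONV_BUF; with
+// do_average == 1 they are blended with the stored prediction, either with
+// equal weights or with the fwd/bck distance weights.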
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                        uint8_t *dst8, int dst8_stride, int w,
+                                        int h, ConvolveParams *conv_params) {
+  if (conv_params->do_average) {
+    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+      dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
+          src, src_stride, dst8, dst8_stride, w, h, conv_params);
+    } else {
+      dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
+                                         h, conv_params);
+    }
+  } else {
+    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
+  }
+}
+
+static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t x_filter,
+                                       const int16x4_t round_offset) {
+  int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
+  sum = vmla_lane_s16(sum, s1, x_filter, 1);
+  sum = vmla_lane_s16(sum, s2, x_filter, 2);
+  sum = vmla_lane_s16(sum, s3, x_filter, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
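+  // vrsra_n_s16() rounds, shifts right and accumulates onto round_offset in
+  // a single instruction.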
+  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t x_filter,
+                                       const int16x8_t round_offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
+  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
+  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
+  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
+  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
+  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
+  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
+  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      __builtin_prefetch(dst_ptr);
+      __builtin_prefetch(dst8_ptr);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+
+      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+                                    vget_low_s16(round_offset_vec));
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01;
+      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                               vget_low_s16(round_offset_vec), &d01);
+
+      store_u8_4x1(dst8_ptr, d01, 0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst8_ptr += dst8_stride;
+    } while (--height != 0);
+  } else {
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+    while (height >= 8) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+      __builtin_prefetch(d + 4 * dst_stride);
+      __builtin_prefetch(d + 5 * dst_stride);
+      __builtin_prefetch(d + 6 * dst_stride);
+      __builtin_prefetch(d + 7 * dst_stride);
+
+      s += 7;
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      x_filter, round_offset_vec);
+
+        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+                                 &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+                     d7_u8);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      dst8_ptr += 8 * dst8_stride;
+      height -= 8;
+    }
+#endif  // AOM_ARCH_AARCH64
+
+    while (height > 0) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      __builtin_prefetch(d);
+
+      s += 8;
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+
+        s0 = s8;
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst8_ptr += dst8_stride;
+      height--;
+    }
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      __builtin_prefetch(dst_ptr);
+      __builtin_prefetch(dst8_ptr);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+
+      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+                                    vget_low_s16(round_offset_vec));
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01;
+      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+      store_u8_4x1(dst8_ptr, d01, 0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst8_ptr += dst8_stride;
+    } while (--height != 0);
+  } else {
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+    while (height >= 8) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+      __builtin_prefetch(d + 4 * dst_stride);
+      __builtin_prefetch(d + 5 * dst_stride);
+      __builtin_prefetch(d + 6 * dst_stride);
+      __builtin_prefetch(d + 7 * dst_stride);
+
+      s += 7;
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      x_filter, round_offset_vec);
+
+        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+                     d7_u8);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      dst8_ptr += 8 * dst8_stride;
+      height -= 8;
+    }
+#endif  // AOM_ARCH_AARCH64
+
+    while (height > 0) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      __builtin_prefetch(d);
+
+      s += 8;
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+
+        s0 = s8;
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst8_ptr += dst8_stride;
+      height--;
+    }
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_neon(
+    const uint8_t *src, int src_stride, int w, int h,
+    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      __builtin_prefetch(dst_ptr);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+
+      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+                                    vget_low_s16(round_offset_vec));
+
+      vst1_u16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+    while (height >= 8) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int width = w;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+      __builtin_prefetch(d + 4 * dst_stride);
+      __builtin_prefetch(d + 5 * dst_stride);
+      __builtin_prefetch(d + 6 * dst_stride);
+      __builtin_prefetch(d + 7 * dst_stride);
+
+      s += 7;
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      x_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      x_filter, round_offset_vec);
+
+        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      height -= 8;
+    }
+#endif  // AOM_ARCH_AARCH64
+
+    while (height > 0) {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int width = w;
+
+      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      __builtin_prefetch(d);
+
+      s = src_ptr + 8;
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                      round_offset_vec);
+
+        vst1q_u16(d, d0);
+
+        s0 = s8;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      height--;
+    }
+  }
+}
+
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const int subpel_x_qn,
+                                  ConvolveParams *conv_params) {
+  if (conv_params->do_average) {
+    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+      dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
+                                            w, h, filter_params_x, subpel_x_qn,
+                                            conv_params);
+    } else {
+      dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params);
+    }
+  } else {
+    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
+                             subpel_x_qn, conv_params);
+  }
+}
+
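+// 6-tap vertical convolution of 4 pixels. The filter is passed in the usual
+// 8-tap layout; taps 0 and 7 are zero, so only lanes 1-6 are multiplied in.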
+static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x8_t y_filter,
+                                       const int16x4_t round_offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  // Filter values at indices 0 and 7 are 0.
+  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
+  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t y_filter,
+                                       const int16x8_t round_offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  // Filter values at indices 0 and 7 are 0.
+  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
+  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpretq_u16_s16(res);
+}
+
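+// Vertical 6-tap convolution combined with distance-weighted compound
+// averaging. Narrow blocks (w == 4 or h == 4) use the 4-wide column path;
+// all other sizes use the 8-wide path, and on AArch64 several rows are
+// produced per loop iteration.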
+static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
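+  // Offset added to each intermediate result; the same value is passed to the
+  // averaging helpers so that it can be removed from the final output.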
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d01, &d23);
+
+        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                                 vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01, 0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr + (5 * src_stride);
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4;
+      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+        uint16x8_t d1 =
+            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+        uint16x8_t d2 =
+            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+        uint16x8_t d3 =
+            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+        uint16x8_t d4 =
+            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+        uint16x8_t d5 =
+            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+        uint16x8_t d6 =
+            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+                                      round_offset_vec);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+                                 &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
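+// As above, but the two predictions are combined with a basic (unweighted)
+// average instead of the distance-weighted one.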
+static INLINE void dist_wtd_convolve_y_6tap_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d01, &d23);
+
+        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01, 0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr + (5 * src_stride);
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4;
+      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+        uint16x8_t d1 =
+            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+        uint16x8_t d2 =
+            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+        uint16x8_t d3 =
+            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+        uint16x8_t d4 =
+            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+        uint16x8_t d5 =
+            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+        uint16x8_t d6 =
+            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+                                      round_offset_vec);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
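+// Vertical 6-tap convolution that only writes the intermediate CONV_BUF_TYPE
+// results; no averaging is applied here.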
+static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
+                                                 int src_stride, int w, int h,
+                                                 const int16x8_t y_filter,
+                                                 ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        vst1_u16(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr + (5 * src_stride);
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4;
+      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+        uint16x8_t d1 =
+            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+        uint16x8_t d2 =
+            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+        uint16x8_t d3 =
+            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+        uint16x8_t d4 =
+            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+        uint16x8_t d5 =
+            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+        uint16x8_t d6 =
+            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+                                      round_offset_vec);
+
+        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        uint16x8_t d0 =
+            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+
+        vst1q_u16(d, d0);
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
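+// Full 8-tap vertical convolution of 4 pixels.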
+static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x8_t y_filter,
+                                       const int16x4_t round_offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
+  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpret_u16_s16(res);
+}
+
+static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t y_filter,
+                                       const int16x8_t round_offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
+  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
+  return vreinterpretq_u16_s16(res);
+}
+
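+// 8-tap counterpart of dist_wtd_convolve_y_6tap_dist_wtd_avg_neon: vertical
+// convolution with distance-weighted compound averaging.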
+static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d01, &d23);
+
+        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d);
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                                 vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01, 0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        __builtin_prefetch(dst_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+                                 &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        __builtin_prefetch(dst_ptr);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        __builtin_prefetch(d);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
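+// 8-tap vertical convolution combined with a basic compound average.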
+static INLINE void dist_wtd_convolve_y_8tap_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d01, &d23);
+
+        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d);
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01, 0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        __builtin_prefetch(dst_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        __builtin_prefetch(dst_ptr);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        __builtin_prefetch(d);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
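+// 8-tap vertical convolution writing intermediate results only (no averaging).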
+static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
+                                                 int src_stride, int w, int h,
+                                                 const int16x8_t y_filter,
+                                                 ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        vst1_u16(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        __builtin_prefetch(dst_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        __builtin_prefetch(dst_ptr);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        vst1q_u16(d, d0);
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Vertical filter.
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  // Filter values are even, so downshift by 1 to reduce intermediate
+  // precision requirements.
+  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
+
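+  // Back the source pointer up by (taps / 2 - 1) rows so the filter window
+  // covers the rows above and below each output row.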
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+  if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
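+    // Filters with at most 6 non-zero taps have zero coefficients at indices
+    // 0 and 7, so skip the first source row and use the 6-tap kernels.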
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
+            src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
+            conv_params);
+      } else {
+        dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
+                                          dst8, dst8_stride, w, h, y_filter,
+                                          conv_params);
+      }
+    } else {
+      dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
+                                    y_filter, conv_params);
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
+                                                   dst8_stride, w, h, y_filter,
+                                                   conv_params);
+      } else {
+        dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
+                                          dst8_stride, w, h, y_filter,
+                                          conv_params);
+      }
+    } else {
+      dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
+                                    conv_params);
+    }
+  }
+}
diff --git a/av1/common/arm/compound_convolve_neon.h b/av1/common/arm/compound_convolve_neon.h
new file mode 100644
index 0000000..cff6838
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon.h
@@ -0,0 +1,1172 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
+
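+// Helpers for the final compound averaging stage. The distance-weighted
+// variants compute (dd * fwd_offset + d * bck_offset) >> DIST_PRECISION_BITS,
+// the basic variants compute (dd + d) >> 1; both then subtract the
+// intermediate rounding offset and saturate-narrow back to 8-bit pixels.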
+static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x4_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+
+  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+
+  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                         const int16x4_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  uint16x4_t avg0 = vhadd_u16(dd0, d0);
+
+  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x8_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
+  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
+  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
+  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
+
+  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_4x4(
+    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+  uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
+  blend1 = vmlal_n_u16(blend1, d1, bck_offset);
+  uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
+  blend2 = vmlal_n_u16(blend2, d2, bck_offset);
+  uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
+  blend3 = vmlal_n_u16(blend3, d3, bck_offset);
+
+  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+  uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
+  uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
+  uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
+
+  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
+  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
+
+  dst_01 = vsubq_s16(dst_01, round_offset);
+  dst_23 = vsubq_s16(dst_23, round_offset);
+
+  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
+  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
+                                         uint16x4_t dd2, uint16x4_t dd3,
+                                         uint16x4_t d0, uint16x4_t d1,
+                                         uint16x4_t d2, uint16x4_t d3,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+  uint16x4_t avg0 = vhadd_u16(dd0, d0);
+  uint16x4_t avg1 = vhadd_u16(dd1, d1);
+  uint16x4_t avg2 = vhadd_u16(dd2, d2);
+  uint16x4_t avg3 = vhadd_u16(dd3, d3);
+
+  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
+  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
+
+  dst_01 = vsubq_s16(dst_01, round_offset);
+  dst_23 = vsubq_s16(dst_23, round_offset);
+
+  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
+  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_8x4(
+    uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
+    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+    uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
+  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
+  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
+  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
+
+  uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
+  blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
+  uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
+  blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);
+
+  uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
+  blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
+  uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
+  blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);
+
+  uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
+  blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
+  uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
+  blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);
+
+  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
+  uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
+  uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
+  uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
+  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
+  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
+  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
+  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
+                                         uint16x8_t dd2, uint16x8_t dd3,
+                                         uint16x8_t d0, uint16x8_t d1,
+                                         uint16x8_t d2, uint16x8_t d3,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+                                         uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
+  uint16x8_t avg1 = vhaddq_u16(dd1, d1);
+  uint16x8_t avg2 = vhaddq_u16(dd2, d2);
+  uint16x8_t avg3 = vhaddq_u16(dd3, d3);
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
+  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
+  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
+  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
+  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE uint16x4_t
+convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = offset_const;
+  // Filter values at indices 0 and 7 are 0.
+  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = offset_const;
+  // Filter values at indices 0 and 7 are 0.
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+  int32x4_t sum1 = offset_const;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
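+  // Combined offset added to the intermediate convolution result (offset_const
+  // here plus the constant folded into the horizontal stage); the averaging
+  // helpers subtract it again before narrowing back to 8 bits.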
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+    src_ptr += 5 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+      uint16x4_t d1 =
+          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+      uint16x4_t d2 =
+          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+      uint16x4_t d3 =
+          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+      dst8_ptr += 4 * dst8_stride;
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s5 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01_u8;
+      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                               vget_low_s16(round_offset_vec), &d01_u8);
+
+      store_u8_4x1(dst8_ptr, d01_u8, 0);
+      dst8_ptr += dst8_stride;
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+        uint16x8_t d1 =
+            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+        uint16x8_t d2 =
+            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+        uint16x8_t d3 =
+            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vld1q_s16(s);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon(
+    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+    src_ptr += 5 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+      uint16x4_t d1 =
+          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+      uint16x4_t d2 =
+          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+      uint16x4_t d3 =
+          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                            round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+      dst8_ptr += 4 * dst8_stride;
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s5 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01_u8;
+      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+      store_u8_4x1(dst8_ptr, d01_u8, 0);
+      dst8_ptr += dst8_stride;
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+        uint16x8_t d1 =
+            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+        uint16x8_t d2 =
+            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+        uint16x8_t d3 =
+            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vld1q_s16(s);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
+    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+    const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+    src_ptr += 5 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+      uint16x4_t d1 =
+          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+      uint16x4_t d2 =
+          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+      uint16x4_t d3 =
+          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s5 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 =
+          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+      vst1_u16(dst_ptr, d0);
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+        uint16x8_t d1 =
+            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+        uint16x8_t d2 =
+            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+        uint16x8_t d3 =
+            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vld1q_s16(s);
+
+        uint16x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+        vst1q_u16(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE uint16x4_t
+convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                 const int16x4_t s6, const int16x4_t s7,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = offset_const;
+  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                 const int16x8_t s6, const int16x8_t s7,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = offset_const;
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+  int32x4_t sum1 = offset_const;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src_ptr += 7 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                       offset_const);
+      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                       offset_const);
+      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                       y_filter, offset_const);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+      dst8_ptr += 4 * dst8_stride;
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s7 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01_u8;
+      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                               vget_low_s16(round_offset_vec), &d01_u8);
+
+      store_u8_4x1(dst8_ptr, d01_u8, 0);
+      dst8_ptr += dst8_stride;
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                         y_filter, offset_const);
+        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                         y_filter, offset_const);
+        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                         y_filter, offset_const);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vld1q_s16(s);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon(
+    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src_ptr += 7 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                       offset_const);
+      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                       offset_const);
+      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                       y_filter, offset_const);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                            round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+      dst8_ptr += 4 * dst8_stride;
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s7 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+
+      uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+      uint8x8_t d01_u8;
+      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+      store_u8_4x1(dst8_ptr, d01_u8, 0);
+      dst8_ptr += dst8_stride;
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                         y_filter, offset_const);
+        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                         y_filter, offset_const);
+        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                         y_filter, offset_const);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vld1q_s16(s);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
+    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+    const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src_ptr += 7 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                       offset_const);
+      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                       offset_const);
+      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                       y_filter, offset_const);
+
+      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s7 = vld1_s16(src_ptr);
+
+      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                       offset_const);
+
+      vst1_u16(dst_ptr, d0);
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      int16_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                         y_filter, offset_const);
+        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                         y_filter, offset_const);
+        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                         y_filter, offset_const);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vld1q_s16(s);
+
+        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_const);
+
+        vst1q_u16(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+#endif  // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/compound_convolve_neon_dotprod.c b/av1/common/arm/compound_convolve_neon_dotprod.c
new file mode 100644
index 0000000..8ab613d
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
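+// Permutation masks for vqtbl1q_s8: each row of 16 indices gathers four
+// overlapping 4-sample windows so that a single dot-product instruction
+// produces four adjacent outputs.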
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16_t permute_tbl) {
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t clamped_samples =
+      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+  // Second 4 output values.
+  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    const int16_t *x_filter_ptr, const int im_h, int w) {
+  const int bd = 8;
+  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+  // Dot product constants and other shims.
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
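+  // The input samples are biased by -128 to fit the signed 8-bit dot product;
+  // adding back 128 * sum(filter / 2) = sum(filter) << (FILTER_BITS - 1)
+  // compensates for that bias (the filter values are halved further below).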
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  // Fold horiz_const into the dot-product filter correction constant. The
+  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+  // rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+  const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
+                                           (1 << ((ROUND0_BITS - 1) - 1)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int height = im_h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
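+    // The two leading taps of the 4-tap filter are zero, so start two columns
+    // further in.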
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 =
+          convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d1 =
+          convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d2 =
+          convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d3 =
+          convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
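+    // Process the remaining rows one at a time.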
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+
+      int16x4_t d0 =
+          convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+                                        permute_tbl);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+                                        permute_tbl);
+
+        vst1q_s16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+void av1_dist_wtd_convolve_2d_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
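+  // The vertical pass only provides 6-tap and 8-tap kernels, so shorter
+  // filters are widened to 6 taps (their outer coefficients are zero).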
+
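+  // The horizontal pass writes im_h = h + taps - 1 rows into the intermediate
+  // buffer so the vertical pass has enough rows above and below the block.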
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+  dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block,
+                                          im_stride, x_filter_ptr, im_h, w);
+
+  if (clamped_y_taps == 6) {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  }
+}
+
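+// In the 2D path above, the horizontal pass writes ROUND0-shifted 16-bit
+// intermediates into im_block and the vertical pass then applies one of the
+// specialised 6- or 8-tap kernels, with optional basic or distance-weighted
+// averaging. Vertical filters with fewer than six taps are stored with zero
+// outer taps and simply run through the 6-tap kernel, which is why
+// clamped_y_taps is floored at six.
+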
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const int32x4_t correction,
+                                       const uint8x16_t range_limit,
+                                       const uint8x16_t permute_tbl) {
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t clamped_samples =
+      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
+}
+
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const int32x4_t correction,
+                                       const uint8x16_t range_limit,
+                                       const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+  // Second 4 output values.
+  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+  return vreinterpretq_u16_s16(res);
+}
+
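+// In convolve4_4_x() and convolve8_8_x() above, subtracting 128 from each
+// unsigned sample makes the input suitable for the signed SDOT instruction,
+// but it also removes 128 * sum(halved filter taps) from every dot product.
+// That is exactly the value computed into 'correction_s32' below
+// (sum(filter << (FILTER_BITS - 1)) == 64 * sum(filter)), so seeding the
+// accumulator with the correction restores the true convolution sum.
+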
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+  // Dot-product constants and other shims.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  // Fold round_offset into the dot-product filter correction constant. The
+  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+  // rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+  int32x4_t correction =
+      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+                  (1 << ((ROUND0_BITS - 1) - 1)));
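+  // With the usual 8-bit configuration (FILTER_BITS == 7, ROUND0_BITS == 3,
+  // COMPOUND_ROUND1_BITS == 7): offset_bits == 19, round_offset == 6144 and
+  // the folded constant is correction_s32 + (6144 << 2) + 2. After the
+  // non-rounding shift by ROUND0_BITS - 1 each lane therefore holds
+  // ROUND_POWER_OF_TWO(full-precision filter sum, ROUND0_BITS) + round_offset.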
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+  // Dot-product constants and other shims.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  // Fold round_offset into the dot-product filter correction constant. The
+  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+  // rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+  int32x4_t correction =
+      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+                  (1 << ((ROUND0_BITS - 1) - 1)));
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                            round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_neon_dotprod(
+    const uint8_t *src, int src_stride, int w, int h,
+    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+
+  // Dot-product constants and other shims.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  // Fold round_offset into the dot-product filter correction constant. The
+  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+  // rounding shifts - which are generally faster than rounding shifts on
+  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+  int32x4_t correction =
+      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
+                  (1 << ((ROUND0_BITS - 1) - 1)));
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+void av1_dist_wtd_convolve_x_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  if (conv_params->do_average) {
+    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+      dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+          src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
+          subpel_x_qn, conv_params);
+    } else {
+      dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride,
+                                           w, h, filter_params_x, subpel_x_qn,
+                                           conv_params);
+    }
+  } else {
+    dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x,
+                                     subpel_x_qn, conv_params);
+  }
+}
diff --git a/av1/common/arm/compound_convolve_neon_i8mm.c b/av1/common/arm/compound_convolve_neon_i8mm.c
new file mode 100644
index 0000000..70d7da9
--- /dev/null
+++ b/av1/common/arm/compound_convolve_neon_i8mm.c
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
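+// Each 16-byte row of dot_prod_permute_tbl gathers four overlapping 4-sample
+// windows (starting offsets n, n + 1, n + 2, n + 3), so a single USDOT
+// against one 4-tap lane of the filter accumulates four taps for four
+// adjacent output pixels; the three rows are chained to cover 8-tap filters.
+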
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const uint8x16_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  // First 4 output values.
+  int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const uint8x16x3_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
+  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+  // Second 4 output values.
+  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
+  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
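+// Unlike the *_neon_dotprod versions of these helpers, the USDOT instruction
+// used here multiplies unsigned samples by signed filter taps directly, so no
+// range clamp of the input and no compensating 'correction' term are needed;
+// only the horizontal offset and rounding shim are folded into the
+// accumulator seed.
+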
+static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    const int16_t *x_filter_ptr, const int im_h, int w) {
+  const int bd = 8;
+  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // (The extra -1 is needed because we halved the filter values.)
+  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+                                            (1 << ((ROUND0_BITS - 1) - 1)));
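+  // With bd == 8, FILTER_BITS == 7 and ROUND0_BITS == 3 this is
+  // (1 << 13) + (1 << 1) == 8194: the 1 << (bd + FILTER_BITS - 1)
+  // intermediate offset, halved to match the halved filter taps, plus the
+  // rounding shim for the shift by ROUND0_BITS - 1.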
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int height = im_h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+
+      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+
+        vst1q_s16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+void av1_dist_wtd_convolve_2d_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+  dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride,
+                                       x_filter_ptr, im_h, w);
+
+  if (clamped_y_taps == 6) {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+            w);
+      } else {
+        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+                                                dst8_stride, conv_params,
+                                                y_filter, h, w);
+      }
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  }
+}
+
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const uint8x16_t permute_tbl,
+                                       const int32x4_t round_offset) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  // First 4 output values.
+  int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
+}
+
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t round_offset) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
+  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+  // Second 4 output values.
+  sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
+  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+  return vreinterpretq_u16_s16(res);
+}
+
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // (The extra -1 is needed because we halved the filter values.)
+  const int32x4_t round_offset_shim = vdupq_n_s32(
+      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_avg_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // (The extra -1 is needed because we halved the filter values.)
+  const int32x4_t round_offset_shim = vdupq_n_s32(
+      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  uint8_t *dst8_ptr = dst8;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+      uint16x4_t dd0, dd1, dd2, dd3;
+      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+      uint8x8_t d01_u8, d23_u8;
+      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                            round_offset_vec, &d01_u8, &d23_u8);
+
+      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
+      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
+      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
+      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+        s += 8;
+        d += 8;
+        d_u8 += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      dst8_ptr += 4 * dst8_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void dist_wtd_convolve_x_neon_i8mm(
+    const uint8_t *src, int src_stride, int w, int h,
+    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // (The extra -1 is needed because we halved the filter values.)
+  const int32x4_t round_offset_shim = vdupq_n_s32(
+      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
+
+  // Horizontal filter.
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - horiz_offset;
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int height = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      uint16x4_t d0 =
+          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d1 =
+          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d2 =
+          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
+      uint16x4_t d3 =
+          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint16x8_t d0 =
+            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d1 =
+            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d2 =
+            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
+        uint16x8_t d3 =
+            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+void av1_dist_wtd_convolve_x_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  if (conv_params->do_average) {
+    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+      dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+          src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
+          subpel_x_qn, conv_params);
+    } else {
+      dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w,
+                                        h, filter_params_x, subpel_x_qn,
+                                        conv_params);
+    }
+  } else {
+    dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x,
+                                  subpel_x_qn, conv_params);
+  }
+}
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 713aaad..fa98922 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -24,848 +24,19 @@
 #include "av1/common/filter.h"
 #include "av1/common/arm/convolve_neon.h"
 
-static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
-                                      const int16x4_t s2, const int16x4_t s3,
-                                      const int16x4_t s4, const int16x4_t s5,
-                                      const int16x4_t s6, const int16x4_t s7,
-                                      const int16x8_t filter) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x4_t sum;
-
-  sum = vmul_lane_s16(s0, filter_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-
-  return sum;
-}
-
-#if !AOM_ARCH_AARCH64
-static INLINE uint8x8_t convolve8_x_4x1(const int16x4_t s0, const int16x4_t s1,
-                                        const int16x4_t s2, const int16x4_t s3,
-                                        const int16x4_t s4, const int16x4_t s5,
-                                        const int16x4_t s6, const int16x4_t s7,
-                                        const int16x8_t filter,
-                                        const int16x4_t horiz_const) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x4_t sum = horiz_const;
-
-  sum = vmla_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-
-  // We halved the convolution filter values so - 1 from the right shift.
-  return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
-}
-#endif  // !AOM_ARCH_AARCH64
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int32x4_t convolve12_4_usdot(uint8x16_t samples,
-                                           const int8x16_t filters,
-                                           const uint8x16x3_t permute_tbl,
-                                           const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
-  sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
-
-  return sum;
-}
-
-static INLINE int16x8_t convolve12_8_usdot(uint8x16_t samples0,
-                                           uint8x16_t samples1,
-                                           const int8x16_t filters,
-                                           const uint8x16x3_t permute_tbl,
-                                           const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[4];
-  int32x4_t sum[2];
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples0, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples0, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples0, permute_tbl.val[2]);
-  /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
-  permuted_samples[3] = vqtbl1q_u8(samples1, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
-  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
-  /* Second 4 output values. */
-  sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
-  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
-  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
-                      vqrshrn_n_s32(sum[1], FILTER_BITS));
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x4_t convolve12_horiz_4_sdot(
-    uint8x16_t samples, const int8x16_t filters, const int32x4_t correction,
-    const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
-  sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
-  sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vshrn_n_s32(sum, ROUND0_BITS);
-}
-
-static INLINE int16x8_t convolve12_horiz_8_sdot(
-    uint8x16_t samples0, uint8x16_t samples1, const int8x16_t filters,
-    const int32x4_t correction, const uint8x16_t range_limit,
-    const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples[2], permuted_samples[4];
-  int32x4_t sum[2];
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples0, range_limit));
-  clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples1, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
-  /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
-  permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
-  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
-  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
-  /* Second 4 output values. */
-  sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
-  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
-  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
-                      vshrn_n_s32(sum[1], ROUND0_BITS));
-}
-
-static INLINE int32x4_t convolve12_4_sdot(uint8x16_t samples,
-                                          const int8x16_t filters,
-                                          const int32x4_t correction,
-                                          const uint8x16_t range_limit,
-                                          const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
-  sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
-  sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
-
-  return sum;
-}
-
-static INLINE int16x8_t convolve12_8_sdot(uint8x16_t samples0,
-                                          uint8x16_t samples1,
-                                          const int8x16_t filters,
-                                          const int32x4_t correction,
-                                          const uint8x16_t range_limit,
-                                          const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples[2], permuted_samples[4];
-  int32x4_t sum[2];
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples0, range_limit));
-  clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples1, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
-  /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
-  permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
-  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
-  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
-  /* Second 4 output values. */
-  sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
-  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
-  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
-                      vqrshrn_n_s32(sum[1], FILTER_BITS));
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE uint8x8_t convolve8_vert_8x4(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x8_t sum;
-
-  sum = vmulq_lane_s16(s0, filter_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_vert_4_s32(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum;
-
-  sum = vmull_lane_s16(s0, y_filter_lo, 0);
-  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
-  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
-  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
-  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
-  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
-  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
-  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
-
-  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE uint8x8_t
-convolve8_vert_8_s32(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                     const int16x8_t s6, const int16x8_t s7,
-                     const int16x8_t y_filter, const int16x8_t sub_const) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum0, sum1;
-  int16x8_t res;
-
-  sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
-
-  sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
-
-  res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
-                     vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
-  res = vsubq_s16(res, sub_const);
-
-  return vqmovun_s16(res);
-}
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const int16_t *x_filter_ptr) {
-  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
-  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
-  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
-  const int8x16_t filter =
-      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
-
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
-    uint8x8_t d0;
-
-    // Undo the horizontal offset in the calling function.
-    src += 5;
-
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j += 8) {
-        d0 = vld1_u8(src + i * src_stride + j);
-        if (w == 2) {
-          store_u8_2x1(dst + i * dst_stride, d0, 0);
-        } else if (w == 4) {
-          store_u8_4x1(dst + i * dst_stride, d0, 0);
-        } else {
-          vst1_u8(dst + i * dst_stride + j, d0);
-        }
-      }
-    }
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
-    // right shift by FILTER_BITS - instead of a first rounding right shift by
-    // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
-    // ROUND0_BITS.
-    const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
-
-    if (w <= 4) {
-      uint8x16_t s0, s1, s2, s3;
-      int32x4_t d0, d1, d2, d3;
-      int16x8_t t01, t23;
-      uint8x8_t d01, d23;
-
-      do {
-        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve12_4_usdot(s0, filter, permute_tbl, horiz_const);
-        d1 = convolve12_4_usdot(s1, filter, permute_tbl, horiz_const);
-        d2 = convolve12_4_usdot(s2, filter, permute_tbl, horiz_const);
-        d3 = convolve12_4_usdot(s3, filter, permute_tbl, horiz_const);
-
-        t01 = vcombine_s16(vqrshrn_n_s32(d0, FILTER_BITS),
-                           vqrshrn_n_s32(d1, FILTER_BITS));
-        t23 = vcombine_s16(vqrshrn_n_s32(d2, FILTER_BITS),
-                           vqrshrn_n_s32(d3, FILTER_BITS));
-
-        d01 = vqmovun_s16(t01);
-        d23 = vqmovun_s16(t23);
-
-        if (w == 2) {
-          store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-          store_u8_2x1(dst + 1 * dst_stride, d01, 2);
-          if (h != 2) {
-            store_u8_2x1(dst + 2 * dst_stride, d23, 0);
-            store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-          }
-        } else {
-          store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-          store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-          if (h != 2) {
-            store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-            store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-          }
-        }
-
-        dst += 4 * dst_stride;
-        src += 4 * src_stride;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
-      int16x8_t d0, d1, d2, d3;
-      uint8x8_t dd0, dd1, dd2, dd3;
-
-      do {
-        const uint8_t *s = src;
-        uint8_t *d = dst;
-        int width = w;
-
-        do {
-          load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-          load_u8_16x4(s + 4, src_stride, &s4, &s5, &s6, &s7);
-
-          d0 = convolve12_8_usdot(s0, s4, filter, permute_tbl, horiz_const);
-          d1 = convolve12_8_usdot(s1, s5, filter, permute_tbl, horiz_const);
-          d2 = convolve12_8_usdot(s2, s6, filter, permute_tbl, horiz_const);
-          d3 = convolve12_8_usdot(s3, s7, filter, permute_tbl, horiz_const);
-
-          dd0 = vqmovun_s16(d0);
-          dd1 = vqmovun_s16(d1);
-          dd2 = vqmovun_s16(d2);
-          dd3 = vqmovun_s16(d3);
-
-          store_u8_8x2(d + 0 * dst_stride, dst_stride, dd0, dd1);
-          if (h != 2) {
-            store_u8_8x2(d + 2 * dst_stride, dst_stride, dd2, dd3);
-          }
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-        src += 4 * src_stride;
-        dst += 4 * dst_stride;
-        h -= 4;
-      } while (h > 0);
-    }
-  }
-}
-
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const int subpel_x_qn,
-                            ConvolveParams *conv_params) {
-  (void)conv_params;
-  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
-
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  src -= horiz_offset;
-
-  if (filter_params_x->taps > 8) {
-    convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
-                             x_filter_ptr);
-    return;
-  }
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
-  // rounding right shift by FILTER_BITS - instead of a first rounding right
-  // shift by ROUND0_BITS, followed by second rounding right shift by
-  // FILTER_BITS - ROUND0_BITS.
-  // The outermost -1 is needed because we halved the filter values.
-  const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
-
-  if (w <= 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x8_t t01, t23;
-    uint8x8_t d01, d23;
-
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-      t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
-      t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
-      t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
-
-      t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
-      t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
-
-      // We halved the convolution filter values so - 1 from the right shift.
-      d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);
-
-      if (w == 2) {
-        store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      h -= 4;
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-    } while (h > 0);
-
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t t0, t1, t2, t3;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      int width = w;
-      const uint8_t *s = src;
-      uint8_t *d = dst;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        t0 = convolve8_x_8_usdot(s0, x_filter, permute_tbl, horiz_const);
-        t1 = convolve8_x_8_usdot(s1, x_filter, permute_tbl, horiz_const);
-        t2 = convolve8_x_8_usdot(s2, x_filter, permute_tbl, horiz_const);
-        t3 = convolve8_x_8_usdot(s3, x_filter, permute_tbl, horiz_const);
-
-        // We halved the convolution filter values so - 1 from the right shift.
-        d0 = vqrshrun_n_s16(t0, FILTER_BITS - 1);
-        d1 = vqrshrun_n_s16(t1, FILTER_BITS - 1);
-        d2 = vqrshrun_n_s16(t2, FILTER_BITS - 1);
-        d3 = vqrshrun_n_s16(t3, FILTER_BITS - 1);
-
-        vst1_u8(d + 0 * dst_stride, d0);
-        vst1_u8(d + 1 * dst_stride, d1);
-        if (h != 2) {
-          vst1_u8(d + 2 * dst_stride, d2);
-          vst1_u8(d + 3 * dst_stride, d3);
-        }
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void convolve_x_sr_12tap_neon(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const int16_t *x_filter_ptr) {
-  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
-  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
-  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
-  const int8x16_t filter =
-      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
-
-  const int32x4_t correct_tmp =
-      vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, 7)),
-                vpaddlq_s16(vshlq_n_s16(filter_8_15, 7)));
-  // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
-  // right shift by FILTER_BITS - instead of a first rounding right shift by
-  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
-  // ROUND0_BITS.
-  int32x4_t correction =
-      vdupq_n_s32(vaddvq_s32(correct_tmp) + (1 << (ROUND0_BITS - 1)));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
-    uint8x8_t d0;
-
-    // Undo the horizontal offset in the calling function.
-    src += 5;
-
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j += 8) {
-        d0 = vld1_u8(src + i * src_stride + j);
-        if (w == 2) {
-          store_u8_2x1(dst + i * dst_stride, d0, 0);
-        } else if (w == 4) {
-          store_u8_4x1(dst + i * dst_stride, d0, 0);
-        } else {
-          vst1_u8(dst + i * dst_stride + j, d0);
-        }
-      }
-    }
-  } else {
-    if (w <= 4) {
-      uint8x16_t s0, s1, s2, s3;
-      int32x4_t d0, d1, d2, d3;
-      int16x8_t t01, t23;
-      uint8x8_t d01, d23;
-
-      do {
-        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 =
-            convolve12_4_sdot(s0, filter, correction, range_limit, permute_tbl);
-        d1 =
-            convolve12_4_sdot(s1, filter, correction, range_limit, permute_tbl);
-        d2 =
-            convolve12_4_sdot(s2, filter, correction, range_limit, permute_tbl);
-        d3 =
-            convolve12_4_sdot(s3, filter, correction, range_limit, permute_tbl);
-
-        t01 = vcombine_s16(vqrshrn_n_s32(d0, FILTER_BITS),
-                           vqrshrn_n_s32(d1, FILTER_BITS));
-        t23 = vcombine_s16(vqrshrn_n_s32(d2, FILTER_BITS),
-                           vqrshrn_n_s32(d3, FILTER_BITS));
-
-        d01 = vqmovun_s16(t01);
-        d23 = vqmovun_s16(t23);
-
-        if (w == 2) {
-          store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-          store_u8_2x1(dst + 1 * dst_stride, d01, 2);
-          if (h != 2) {
-            store_u8_2x1(dst + 2 * dst_stride, d23, 0);
-            store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-          }
-        } else {
-          store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-          store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-          if (h != 2) {
-            store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-            store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-          }
-        }
-
-        dst += 4 * dst_stride;
-        src += 4 * src_stride;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
-      int16x8_t d0, d1, d2, d3;
-      uint8x8_t dd0, dd1, dd2, dd3;
-
-      do {
-        const uint8_t *s = src;
-        uint8_t *d = dst;
-        int width = w;
-
-        do {
-          load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-          load_u8_16x4(s + 4, src_stride, &s4, &s5, &s6, &s7);
-
-          d0 = convolve12_8_sdot(s0, s4, filter, correction, range_limit,
-                                 permute_tbl);
-          d1 = convolve12_8_sdot(s1, s5, filter, correction, range_limit,
-                                 permute_tbl);
-          d2 = convolve12_8_sdot(s2, s6, filter, correction, range_limit,
-                                 permute_tbl);
-          d3 = convolve12_8_sdot(s3, s7, filter, correction, range_limit,
-                                 permute_tbl);
-
-          dd0 = vqmovun_s16(d0);
-          dd1 = vqmovun_s16(d1);
-          dd2 = vqmovun_s16(d2);
-          dd3 = vqmovun_s16(d3);
-
-          store_u8_8x2(d + 0 * dst_stride, dst_stride, dd0, dd1);
-          if (h != 2) {
-            store_u8_8x2(d + 2 * dst_stride, dst_stride, dd2, dd3);
-          }
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-        src += 4 * src_stride;
-        dst += 4 * dst_stride;
-        h -= 4;
-      } while (h > 0);
-    }
-  }
-}
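The range_limit / correction pair set up near the top of this function encodes a simple identity: subtracting 128 from each uint8_t sample moves it into the int8 range required by the signed dot-product (SDOT) instruction, and the constant term 128 * sum(filter taps), together with the 1 << (ROUND0_BITS - 1) rounding shim, is folded back in through 'correction'. A minimal scalar model of that bookkeeping (an editorial sketch, not library code; the function name is hypothetical, and ROUND0_BITS plus the <stdint.h> types are the ones already used in this file):

static int32_t convolve12_sdot_model(const uint8_t *x, const int8_t *filter) {
  int32_t filter_sum = 0;
  int32_t acc = 0;
  for (int k = 0; k < 12; ++k) {
    filter_sum += filter[k];
    // Clamped sample: x[k] - 128 fits the int8 range required by SDOT.
    acc += filter[k] * ((int32_t)x[k] - 128);
  }
  // 'correction' restores the subtracted 128s and adds the rounding shim.
  const int32_t correction = 128 * filter_sum + (1 << (ROUND0_BITS - 1));
  return acc + correction;  // == sum_k filter[k] * x[k] + rounding shim
}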
-
-void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const int subpel_x_qn,
-                            ConvolveParams *conv_params) {
-  (void)conv_params;
-  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
-
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  src -= horiz_offset;
-
-  if (filter_params_x->taps > 8) {
-    convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
-                             x_filter_ptr);
-    return;
-  }
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-  // Dot product constants.
-  const int16x8_t correct_tmp = vshll_n_s8(x_filter, 7);
-  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
-  // rounding right shift by FILTER_BITS - instead of a first rounding right
-  // shift by ROUND0_BITS, followed by second rounding right shift by
-  // FILTER_BITS - ROUND0_BITS.
-  // The outermost -1 is needed because we halved the filter values.
-  const int32x4_t correction =
-      vdupq_n_s32(vaddlvq_s16(correct_tmp) + (1 << ((ROUND0_BITS - 1) - 1)));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
-  if (w <= 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x8_t t01, t23;
-    uint8x8_t d01, d23;
-
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
-      t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
-      t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
-      t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
-
-      t01 = vcombine_s16(vmovn_s32(t0), vmovn_s32(t1));
-      t23 = vcombine_s16(vmovn_s32(t2), vmovn_s32(t3));
-
-      // We halved the convolution filter values so - 1 from the right shift.
-      d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);
-
-      if (w == 2) {
-        store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      h -= 4;
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-    } while (h > 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t t0, t1, t2, t3;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      int width = w;
-      const uint8_t *s = src;
-      uint8_t *d = dst;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        t0 = convolve8_x_8_sdot(s0, x_filter, correction, range_limit,
-                                permute_tbl);
-        t1 = convolve8_x_8_sdot(s1, x_filter, correction, range_limit,
-                                permute_tbl);
-        t2 = convolve8_x_8_sdot(s2, x_filter, correction, range_limit,
-                                permute_tbl);
-        t3 = convolve8_x_8_sdot(s3, x_filter, correction, range_limit,
-                                permute_tbl);
-
-        // We halved the convolution filter values so - 1 from the right shift.
-        d0 = vqrshrun_n_s16(t0, FILTER_BITS - 1);
-        d1 = vqrshrun_n_s16(t1, FILTER_BITS - 1);
-        d2 = vqrshrun_n_s16(t2, FILTER_BITS - 1);
-        d3 = vqrshrun_n_s16(t3, FILTER_BITS - 1);
-
-        vst1_u8(d + 0 * dst_stride, d0);
-        vst1_u8(d + 1 * dst_stride, d1);
-        if (h != 2) {
-          vst1_u8(d + 2 * dst_stride, d2);
-          vst1_u8(d + 3 * dst_stride, d3);
-        }
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE uint8x8_t
-convolve8_horiz_8x8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                    const int16x8_t s6, const int16x8_t s7,
-                    const int16x8_t filter, const int16x8_t horiz_const) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x8_t sum = horiz_const;
-
-  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
-  // We halved the convolution filter values so - 1 from the right shift.
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve12_x_4x4_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
-    const int32x4_t horiz_const) {
+static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x4_t s8, const int16x4_t s9,
+                                       const int16x4_t s10, const int16x4_t s11,
+                                       const int16x8_t x_filter_0_7,
+                                       const int16x4_t x_filter_8_11,
+                                       const int32x4_t horiz_const) {
   const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
   const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
-  int32x4_t sum = horiz_const;
 
+  int32x4_t sum = horiz_const;
   sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
   sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
   sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
@@ -882,69 +53,6 @@
   return vqrshrn_n_s32(sum, FILTER_BITS);
 }
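For reference, each output lane of convolve12_4_x above reduces to the following scalar arithmetic (an illustrative sketch only, assuming the FILTER_BITS / ROUND0_BITS constants and <stdint.h> types already used in this file; the final clamp corresponds to the vqmovun_s16 narrowing done by the caller):

static uint8_t convolve12_x_scalar(const uint8_t *src, const int16_t *filter) {
  int32_t sum = 1 << (ROUND0_BITS - 1);  // horiz_const rounding shim
  for (int k = 0; k < 12; ++k) {
    sum += (int32_t)filter[k] * src[k];
  }
  // Single rounding right shift by FILTER_BITS (vqrshrn_n_s32 above), then
  // clamp to the 8-bit output range (vqmovun_s16 in the caller).
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}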
 
-// 4 column per iteration filtering for 12-tap convolve_x_sr.
-// Processes one row at a time.
-static INLINE void x_filter_12tap_w4_single_row(
-    const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr,
-    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11) {
-  // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single
-  // rounding right shift by FILTER_BITS - instead of a first rounding right
-  // shift by ROUND0_BITS, followed by second rounding right shift by
-  // FILTER_BITS - ROUND0_BITS.
-  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
-
-  do {
-    const uint8_t *s = src_ptr;
-    uint8_t *d = dst_ptr;
-    int width = w;
-
-    do {
-      uint8x8_t dd0;
-      uint8x16_t t0;
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, d0;
-      int16x8_t tt0, tt1;
-
-      t0 = vld1q_u8(s);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
-
-      s0 = vget_low_s16(tt0);
-      s4 = vget_high_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s12 = vget_high_s16(tt1);
-
-      s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
-      s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
-      s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
-      s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
-      s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
-      s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
-      s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
-      s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
-      s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
-
-      d0 = convolve12_x_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                s11, x_filter_0_7, x_filter_8_11, horiz_const);
-
-      dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
-
-      if (w == 2) {
-        store_u8_2x1(d, dd0, 0);
-      } else {
-        store_u8_4x1(d, dd0, 0);
-      }
-
-      s += 4;
-      d += 4;
-      width -= 4;
-    } while (width > 0);
-
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  } while (--h != 0);
-}
-
 static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
                                             int src_stride, uint8_t *dst_ptr,
                                             const int dst_stride, int w, int h,
@@ -952,87 +60,71 @@
   const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
 
-#if AOM_ARCH_AARCH64
-  // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single
-  // rounding right shift by FILTER_BITS - instead of a first rounding right
-  // shift by ROUND0_BITS, followed by second rounding right shift by
-  // FILTER_BITS - ROUND0_BITS.
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+  // shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
+  // ROUND0_BITS.
   const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
 
+#if AOM_ARCH_AARCH64
   do {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint8x8_t t0, t1, t2, t3;
-
     const uint8_t *s = src_ptr;
     uint8_t *d = dst_ptr;
     int width = w;
 
+    uint8x8_t t0, t1, t2, t3;
     load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
 
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
     load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
 
-    s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
 
     s += 11;
 
     do {
-      int16x4_t s11, s12, s13, s14, d0, d1, d2, d3;
-      int16x8_t d01, d23;
-      uint8x8_t dd01, dd23;
-
       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
 
-      s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-      d0 = convolve12_x_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                s11, x_filter_0_7, x_filter_8_11, horiz_const);
-      d1 = convolve12_x_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                s12, x_filter_0_7, x_filter_8_11, horiz_const);
-      d2 = convolve12_x_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                s13, x_filter_0_7, x_filter_8_11, horiz_const);
-      d3 = convolve12_x_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
-                                s14, x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d0 =
+          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                         x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d1 =
+          convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                         x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d2 =
+          convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                         x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d3 =
+          convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+                         x_filter_0_7, x_filter_8_11, horiz_const);
 
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
 
-      d01 = vcombine_s16(d0, d1);
-      d23 = vcombine_s16(d2, d3);
+      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
 
-      dd01 = vqmovun_s16(d01);
-      dd23 = vqmovun_s16(d23);
-
-      if (w == 2) {
-        store_u8_2x1(d + 0 * dst_stride, dd01, 0);
-        store_u8_2x1(d + 1 * dst_stride, dd01, 2);
-        if (h != 2) {
-          store_u8_2x1(d + 2 * dst_stride, dd23, 0);
-          store_u8_2x1(d + 3 * dst_stride, dd23, 2);
-        }
-      } else {
-        store_u8_4x1(d + 0 * dst_stride, dd01, 0);
-        store_u8_4x1(d + 1 * dst_stride, dd01, 1);
-        if (h != 2) {
-          store_u8_4x1(d + 2 * dst_stride, dd23, 0);
-          store_u8_4x1(d + 3 * dst_stride, dd23, 1);
-        }
-      }
+      store_u8_4x1(d + 0 * dst_stride, d01, 0);
+      store_u8_4x1(d + 1 * dst_stride, d01, 1);
+      store_u8_4x1(d + 2 * dst_stride, d23, 0);
+      store_u8_4x1(d + 3 * dst_stride, d23, 1);
 
       s0 = s4;
       s1 = s5;
@@ -1048,34 +140,109 @@
       s += 4;
       d += 4;
       width -= 4;
-    } while (width > 0);
-
+    } while (width != 0);
     src_ptr += 4 * src_stride;
     dst_ptr += 4 * dst_stride;
     h -= 4;
-  } while (h >= 4);
+  } while (h != 0);
 
-  if (h > 0) {
-    x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
-                                 x_filter_0_7, x_filter_8_11);
-  }
 #else   // !AOM_ARCH_AARCH64
-  x_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w, h,
-                               x_filter_0_7, x_filter_8_11);
+  do {
+    const uint8_t *s = src_ptr;
+    uint8_t *d = dst_ptr;
+    int width = w;
+
+    do {
+      uint8x16_t t0 = vld1q_u8(s);
+      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+      int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+
+      int16x4_t s0 = vget_low_s16(tt0);
+      int16x4_t s4 = vget_high_s16(tt0);
+      int16x4_t s8 = vget_low_s16(tt8);
+      int16x4_t s12 = vget_high_s16(tt8);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
+      int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
+      int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
+      int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
+      int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
+      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
+      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
+
+      int16x4_t d0 =
+          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                         x_filter_0_7, x_filter_8_11, horiz_const);
+
+      uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
+
+      store_u8_4x1(d, dd0, 0);
+
+      s += 4;
+      d += 4;
+      width -= 4;
+    } while (width != 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  } while (--h != 0);
 #endif  // AOM_ARCH_AARCH64
 }
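The shim comment inside convolve_x_sr_12tap_neon relies on the fact that two rounding right shifts compose into one if the first stage's rounding bias is pre-added to the accumulator. A minimal scalar sketch of that equivalence (editorial, with hypothetical helper names; for any accumulator value that does not overflow, both helpers return the same result):

static int32_t round_two_stage(int32_t x) {
  // First rounding right shift by ROUND0_BITS ...
  const int32_t y = (x + (1 << (ROUND0_BITS - 1))) >> ROUND0_BITS;
  // ... followed by a second rounding right shift by FILTER_BITS - ROUND0_BITS.
  return (y + (1 << (FILTER_BITS - ROUND0_BITS - 1))) >>
         (FILTER_BITS - ROUND0_BITS);
}

static int32_t round_one_stage(int32_t x) {
  // Pre-adding the 1 << (ROUND0_BITS - 1) shim lets one rounding right shift
  // by FILTER_BITS (vqrshrn_n_s32 in the NEON code) cover both stages.
  return (x + (1 << (ROUND0_BITS - 1)) + (1 << (FILTER_BITS - 1))) >>
         FILTER_BITS;
}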
 
+static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t filter,
+                                      const int16x4_t horiz_const) {
+  int16x4_t sum = horiz_const;
+  sum = vmla_lane_s16(sum, s0, filter, 0);
+  sum = vmla_lane_s16(sum, s1, filter, 1);
+  sum = vmla_lane_s16(sum, s2, filter, 2);
+  sum = vmla_lane_s16(sum, s3, filter, 3);
+
+  // We halved the filter values so subtract 1 from the right shift.
+  return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1);
+}
+
+static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t s6, const int16x8_t s7,
+                                      const int16x8_t filter,
+                                      const int16x8_t horiz_const) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
+  int16x8_t sum = horiz_const;
+  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+  // We halved the filter values so subtract 1 from the right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const int subpel_x_qn,
                             ConvolveParams *conv_params) {
-  (void)conv_params;
+  if (w == 2 || h == 2) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    return;
+  }
+
   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+  src -= horiz_offset;
 
   const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  src -= horiz_offset;
 
   if (filter_params_x->taps > 8) {
     convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
@@ -1083,503 +250,303 @@
     return;
   }
 
-  uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-  uint8x8_t t1, t2, t3;
   // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
   // rounding right shift by FILTER_BITS - instead of a first rounding right
   // shift by ROUND0_BITS, followed by a second rounding right shift by
   // FILTER_BITS - ROUND0_BITS.
-  // The outermost -1 is needed because we halved the filter values.
+  // The outermost -1 is needed because we will halve the filter values.
   const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-#endif  // AOM_ARCH_AARCH64
-  // Filter values are even so downshift by 1 to reduce precision requirements.
-  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
-#if AOM_ARCH_AARCH64
-  if (h == 4) {
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t d01_temp, d23_temp;
+  if (w <= 4) {
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce precision requirements.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
 
-    __builtin_prefetch(src + 0 * src_stride);
-    __builtin_prefetch(src + 1 * src_stride);
-    __builtin_prefetch(src + 2 * src_stride);
-    __builtin_prefetch(src + 3 * src_stride);
-
-    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    __builtin_prefetch(dst + 0 * dst_stride);
-    __builtin_prefetch(dst + 1 * dst_stride);
-    __builtin_prefetch(dst + 2 * dst_stride);
-    __builtin_prefetch(dst + 3 * dst_stride);
-    src += 7;
+    src += 2;
 
     do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      uint8x8_t t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
 
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
 
-      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
-      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
-      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
-      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+      uint8x8_t d0 =
+          convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
 
-      d01_temp = vcombine_s16(d0, d1);
-      d23_temp = vcombine_s16(d2, d3);
-
-      d01_temp = vaddq_s16(d01_temp, horiz_const);
-      d23_temp = vaddq_s16(d23_temp, horiz_const);
-
-      // We halved the convolution filter values so - 1 from the right shift.
-      d01 = vqrshrun_n_s16(d01_temp, FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(d23_temp, FILTER_BITS - 1);
-
-      transpose_u8_4x4(&d01, &d23);
-
-      if (w == 2) {
-        store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst + 1 * dst_stride, d23, 0);
-        store_u8_2x1(dst + 2 * dst_stride, d01, 2);
-        store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-      } else {
-        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst + 1 * dst_stride, d23, 0);
-        store_u8_4x1(dst + 2 * dst_stride, d01, 1);
-        store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-      }
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      src += 4;
-      dst += 4;
-      w -= 4;
-    } while (w > 0);
-  } else {
-#endif  // AOM_ARCH_AARCH64
-    int width;
-    const uint8_t *s;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10;
-    uint8x8_t t4, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    if (w <= 4) {
-#if AOM_ARCH_AARCH64
-      do {
-        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
-                    &t7);
-        src += 8 * src_stride;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
-
-        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                                 horiz_const);
-        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                                 horiz_const);
-        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                                 horiz_const);
-        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                                 horiz_const);
-
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-        if (w == 4) {
-          store_u8_4x1(dst + 0 * dst_stride, t0, 0);
-          store_u8_4x1(dst + 1 * dst_stride, t1, 0);
-          if (h > 4) {
-            store_u8_4x1(dst + 2 * dst_stride, t2, 0);
-            store_u8_4x1(dst + 3 * dst_stride, t3, 0);
-            store_u8_4x1(dst + 4 * dst_stride, t0, 1);
-            store_u8_4x1(dst + 5 * dst_stride, t1, 1);
-            store_u8_4x1(dst + 6 * dst_stride, t2, 1);
-            store_u8_4x1(dst + 7 * dst_stride, t3, 1);
-          }
-        } else if (w == 2) {
-          store_u8_2x1(dst + 0 * dst_stride, t0, 0);
-          store_u8_2x1(dst + 1 * dst_stride, t1, 0);
-          if (h > 4) {
-            store_u8_2x1(dst + 2 * dst_stride, t2, 0);
-            store_u8_2x1(dst + 3 * dst_stride, t3, 0);
-            store_u8_2x1(dst + 4 * dst_stride, t0, 2);
-            store_u8_2x1(dst + 5 * dst_stride, t1, 2);
-            store_u8_2x1(dst + 6 * dst_stride, t2, 2);
-            store_u8_2x1(dst + 7 * dst_stride, t3, 2);
-          }
-        }
-
-        dst += 8 * dst_stride;
-        h -= 8;
-      } while (h > 0);
-#else   // !AOM_ARCH_AARCH64
-    // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
-    // rounding right shift by FILTER_BITS - instead of a first rounding right
-    // shift by ROUND0_BITS, followed by second rounding right shift by
-    // FILTER_BITS - ROUND0_BITS.
-    // The outermost -1 is needed because we halved the filter values.
-    const int16x4_t horiz_const = vdup_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-    int16x8_t tt0;
-    int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
-
-    do {
-      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
-      x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
-
-      t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
-
-      x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
-      x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
-      x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
-      x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
-      x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
-      x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
+      store_u8_4x1(dst, d0, 0);
 
       src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {
+    // Filter values are even, so halve to reduce precision requirements.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
-      t0 = convolve8_x_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
-                           horiz_const);
-
-      if (w == 4) {
-        store_u8_4x1(dst, t0, 0);
-        dst += dst_stride;
-      } else if (w == 2) {
-        store_u8_2x1(dst, t0, 0);
-        dst += dst_stride;
-      }
-      h -= 1;
-    } while (h > 0);
-#endif  // AOM_ARCH_AARCH64
-    } else {
-      uint8_t *d;
-      int16x8_t s11;
 #if AOM_ARCH_AARCH64
-      int16x8_t s12, s13, s14;
-      do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+    while (h >= 8) {
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        width = w;
-        s = src + 7;
-        d = dst;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
-        do {
-          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+      int width = w;
+      const uint8_t *s = src + 7;
+      uint8_t *d = dst;
 
-          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                                   horiz_const);
-          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                                   horiz_const);
-          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                                   horiz_const);
-          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                                   horiz_const);
-          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                                   horiz_const);
-          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                                   horiz_const);
-          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                                   horiz_const);
-          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
-                                   x_filter, horiz_const);
-
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-          if (h != 2) {
-            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
-          } else {
-            store_u8_8x2(d, dst_stride, t0, t1);
-          }
-
-          s0 = s8;
-          s1 = s9;
-          s2 = s10;
-          s3 = s11;
-          s4 = s12;
-          s5 = s13;
-          s6 = s14;
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-        src += 8 * src_stride;
-        dst += 8 * dst_stride;
-        h -= 8;
-      } while (h > 0);
-#else   // !AOM_ARCH_AARCH64
-    // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
-    // rounding right shift by FILTER_BITS - instead of a first rounding right
-    // shift by ROUND0_BITS, followed by second rounding right shift by
-    // FILTER_BITS - ROUND0_BITS.
-    // The outermost -1 is needed because we halved the filter values.
-    const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
-
-    do {
-      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
-      width = w;
-      s = src + 8;
-      d = dst;
-      __builtin_prefetch(dst);
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+      __builtin_prefetch(d + 4 * dst_stride);
+      __builtin_prefetch(d + 5 * dst_stride);
+      __builtin_prefetch(d + 6 * dst_stride);
+      __builtin_prefetch(d + 7 * dst_stride);
 
       do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s11 = s0;
-        s0 = s7;
+        uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
 
-        s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+        transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
+                                       &t14);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
 
-        t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                                 horiz_const);
+        uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                     horiz_const);
+        uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                     horiz_const);
+        uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                     horiz_const);
+        uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                     horiz_const);
+        uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+                                     horiz_const);
+        uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+                                     x_filter, horiz_const);
+        uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+                                     x_filter, horiz_const);
+        uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+                                     x_filter, horiz_const);
 
-        vst1_u8(d, t0);
+        transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
+        store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
+      src += 8 * src_stride;
+      dst += 8 * dst_stride;
+      h -= 8;
+    }
+#endif  // AOM_ARCH_AARCH64
+
+    while (h-- != 0) {
+      uint8x8_t t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      int width = w;
+      const uint8_t *s = src + 8;
+      uint8_t *d = dst;
+
+      __builtin_prefetch(d);
+
+      do {
+        uint8x8_t t8 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                     horiz_const);
+
+        vst1_u8(d, d0);
+
+        s0 = s8;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
       src += src_stride;
       dst += dst_stride;
-      h -= 1;
-    } while (h > 0);
-#endif  // AOM_ARCH_AARCH64
     }
-#if AOM_ARCH_AARCH64
   }
-#endif  // AOM_ARCH_AARCH64
 }
 
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
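Since the AV1 8-tap filter values are even, av1_convolve_x_sr_neon above halves them up front: the halving is exact, the horizontal accumulator then fits in the 16-bit lanes used by the NEON code, and both the rounding shim and the final right shift shrink by one bit (hence the FILTER_BITS - 1 shifts). A scalar sketch of that variant (illustrative only; the helper name is hypothetical):

static uint8_t convolve8_x_halved_scalar(const uint8_t *src,
                                         const int16_t *filter) {
  // Halved shim, matching horiz_const = 1 << ((ROUND0_BITS - 1) - 1) above.
  int32_t sum = 1 << ((ROUND0_BITS - 1) - 1);
  for (int k = 0; k < 8; ++k) {
    // filter[k] is even, so filter[k] >> 1 loses no precision.
    sum += (filter[k] >> 1) * src[k];
  }
  // One less bit of rounding right shift because the taps were halved.
  sum = (sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}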
+static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x8_t y_filter_0_7) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+  // Filter values at indices 0 and 7 are 0.
+  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
+  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  return sum;
+}
+
+static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t y_filters) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filters);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filters);
+
+  // Filter values at indices 0 and 7 are 0.
+  int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
+  sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
+  sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
+  // We halved the filter values so subtract 1 from the right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
 
 static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            const int dst_stride, int w, int h,
-                                           const int16x8_t y_filter_0_7) {
+                                           const int16x8_t y_filter) {
   if (w <= 4) {
-    uint8x8_t t0, t1, t2, t3, t4, t5;
-    int16x4_t s0, s1, s2, s3, s4, s5, d0;
-    uint8x8_t d01;
+    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
 
-#if AOM_ARCH_AARCH64
-    uint8x8_t t6, t7, t8;
-    int16x4_t s6, s7, s8, d1, d2, d3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
 
-    const uint8_t *s = src_ptr + src_stride;
-    uint8_t *d = dst_ptr;
-
-    load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
-    s += 5 * src_stride;
+    src_ptr += 5 * src_stride;
 
     do {
 #if AOM_ARCH_AARCH64
-      load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
-      s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
-      s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
 
-      d0 = convolve6_4x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
-      d1 = convolve6_4x4(s1, s2, s3, s4, s5, s6, y_filter_0_7);
-      d2 = convolve6_4x4(s2, s3, s4, s5, s6, s7, y_filter_0_7);
-      d3 = convolve6_4x4(s3, s4, s5, s6, s7, s8, y_filter_0_7);
+      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+      int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
+      int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
+      int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);
 
-      if (w == 2) {
-        store_u8_2x1(d + 0 * dst_stride, d01, 0);
-        store_u8_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(d + 2 * dst_stride, d23, 0);
-          store_u8_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(d + 0 * dst_stride, d01, 0);
-        store_u8_4x1(d + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(d + 2 * dst_stride, d23, 0);
-          store_u8_4x1(d + 3 * dst_stride, d23, 1);
-        }
-      }
+      // We halved the filter values, so the right shift is reduced by 1.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
 
       s0 = s4;
       s1 = s5;
       s2 = s6;
       s3 = s7;
       s4 = s8;
-      s += 4 * src_stride;
-      d += 4 * dst_stride;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
       h -= 4;
 #else   // !AOM_ARCH_AARCH64
-      t5 = vld1_u8(s);
-      s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
+      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
 
-      d0 = convolve6_4x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
+      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+      // We halved the filter values, so the right shift is reduced by 1.
+      uint8x8_t d01 =
+          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
 
-      if (w == 2) {
-        store_u8_2x1(d, d01, 0);
-      } else {
-        store_u8_4x1(d, d01, 0);
-      }
+      store_u8_4x1(dst_ptr, d01, 0);
 
       s0 = s1;
       s1 = s2;
       s2 = s3;
       s3 = s4;
       s4 = s5;
-      s += src_stride;
-      d += dst_stride;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
       h--;
 #endif  // AOM_ARCH_AARCH64
-    } while (h > 0);
+    } while (h != 0);
+
   } else {
-    // if width is a multiple of 8 & height is a multiple of 4
-    uint8x8_t t0, t1, t2, t3, t4, t5;
-    int16x8_t s0, s1, s2, s3, s4, s5, dd0;
-    uint8x8_t d0;
-#if AOM_ARCH_AARCH64
-    uint8x8_t t6, t7, t8;
-    int16x8_t s6, s7, s8, dd1, dd2, dd3;
-    uint8x8_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
     do {
-      int height = h;
-      const uint8_t *s = src_ptr + src_stride;
+      const uint8_t *s = src_ptr;
       uint8_t *d = dst_ptr;
+      int height = h;
 
+      uint8x8_t t0, t1, t2, t3, t4;
       load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
       s += 5 * src_stride;
 
       do {
 #if AOM_ARCH_AARCH64
+        uint8x8_t t5, t6, t7, t8;
         load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
 
-        dd0 = convolve6_8x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
-        dd1 = convolve6_8x4(s1, s2, s3, s4, s5, s6, y_filter_0_7);
-        dd2 = convolve6_8x4(s2, s3, s4, s5, s6, s7, y_filter_0_7);
-        dd3 = convolve6_8x4(s3, s4, s5, s6, s7, s8, y_filter_0_7);
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
 
-        d0 = vqrshrun_n_s16(dd0, FILTER_BITS - 1);
-        d1 = vqrshrun_n_s16(dd1, FILTER_BITS - 1);
-        d2 = vqrshrun_n_s16(dd2, FILTER_BITS - 1);
-        d3 = vqrshrun_n_s16(dd3, FILTER_BITS - 1);
+        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
+        uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
+        uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
+        uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);
 
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-        } else {
-          store_u8_8x2(d, dst_stride, d0, d1);
-        }
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -1590,11 +557,9 @@
         d += 4 * dst_stride;
         height -= 4;
 #else   // !AOM_ARCH_AARCH64
-        t5 = vld1_u8(s);
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
 
-        dd0 = convolve6_8x4(s0, s1, s2, s3, s4, s5, y_filter_0_7);
-        d0 = vqrshrun_n_s16(dd0, FILTER_BITS - 1);
+        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
 
         vst1_u8(d, d0);
 
@@ -1607,21 +572,217 @@
         d += dst_stride;
         height--;
 #endif  // AOM_ARCH_AARCH64
-      } while (height > 0);
-
+      } while (height != 0);
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
-static INLINE int16x4_t convolve12_y_4x4_s32(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x4_t s6, const int16x4_t s7,
+                                      const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
+  int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t s6, const int16x8_t s7,
+                                      const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
+  int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+  // We halved the filter values, so the right shift is reduced by 1.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
+                                           int src_stride, uint8_t *dst_ptr,
+                                           const int dst_stride, int w, int h,
+                                           const int16x8_t y_filter) {
+  if (w <= 4) {
+    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
+    uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
+    uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
+
+    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src_ptr += 7 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+      uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+      uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      // We halved the filter values, so the right shift is reduced by 1.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+
+      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      // We halved the filter values, so the right shift is reduced by 1.
+      uint8x8_t d01 =
+          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
+
+      store_u8_4x1(dst_ptr, d01, 0);
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      uint8_t *d = dst_ptr;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+        uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+        uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+        uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+        vst1_u8(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x4_t s8, const int16x4_t s9,
+                                       const int16x4_t s10, const int16x4_t s11,
+                                       const int16x8_t y_filter_0_7,
+                                       const int16x4_t y_filter_8_11) {
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
   int16x4_t sum;
@@ -1638,22 +799,22 @@
   sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
   sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);
 
-  // Separate out the two filter values in the middle of the kernel that have
-  // the largest magnitude and use saturating addition to prevent overflow. This
-  // means we can stay at 16-bit elements, rather than having to widen
-  // everything to a 32-bit result, requiring twice the number of instructions.
+  // Saturating addition is required for the largest filter taps to avoid
+  // overflow (while staying in 16-bit elements).
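+  // These are the two middle taps with the largest magnitude; keeping the
+  // rest of the sum in plain 16-bit adds avoids widening to 32 bits.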
   sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
   sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));
 
   return sum;
 }
 
-static INLINE uint8x8_t convolve12_y_8x4_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
-    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t s8, const int16x8_t s9,
+                                       const int16x8_t s10, const int16x8_t s11,
+                                       const int16x8_t y_filter_0_7,
+                                       const int16x4_t y_filter_8_11) {
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
   int16x8_t sum;
@@ -1670,10 +831,8 @@
   sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
   sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);
 
-  // Separate out the two filter values in the middle of the kernel that have
-  // the largest magnitude and use saturating addition to prevent overflow. This
-  // means we can stay at 16-bit elements, rather than having to widen
-  // everything to a 32-bit result, requiring twice the number of instructions.
+  // Saturating addition is required for the largest filter taps to avoid
+  // overflow (while staying in 16-bit elements).
   sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
   sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));
 
@@ -1684,98 +843,52 @@
                                             int src_stride, uint8_t *dst_ptr,
                                             int dst_stride, int w, int h,
                                             const int16_t *y_filter_ptr) {
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (y_filter_ptr[5] == 128) {
-    // Undo the horizontal offset in the calling function
-    src_ptr += 5 * src_stride;
-
-    if (w <= 4) {
-      for (int i = 0; i < h; i += 2) {
-        uint8x8_t d0 = load_unaligned_u8(src_ptr + i * src_stride, src_stride);
-        if (w == 2) {
-          store_u8_2x1(dst_ptr + i * dst_stride, d0, 0);
-          store_u8_2x1(dst_ptr + (i + 1) * dst_stride, d0, 1);
-        } else if (w == 4) {
-          store_u8_4x1(dst_ptr + i * dst_stride, d0, 0);
-          store_u8_4x1(dst_ptr + (i + 1) * dst_stride, d0, 1);
-        }
-      }
-    } else {
-      for (int i = 0; i < h; i++) {
-        for (int j = 0; j < w; j += 8) {
-          uint8x8_t d0 = vld1_u8(src_ptr + i * src_stride + j);
-          vst1_u8(dst_ptr + i * dst_stride + j, d0);
-        }
-      }
-    }
-    return;
-  }
-
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
 
   if (w <= 4) {
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-    int16x4_t d0, d1, d2, d3;
-    int16x8_t dd01, dd23;
-    uint8x8_t d01, d23;
-
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
     load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
                  &t8, &t9, &t10);
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
-    s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
-    s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
-    s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
-    s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
-    s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
-    s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+    int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
 
     src_ptr += 11 * src_stride;
 
     do {
+      uint8x8_t t11, t12, t13, t14;
       load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
-      s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
-      s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
-      s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
-      s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
 
-      d0 = convolve12_y_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                s11, y_filter_0_7, y_filter_8_11);
-      d1 = convolve12_y_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                s12, y_filter_0_7, y_filter_8_11);
-      d2 = convolve12_y_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                s13, y_filter_0_7, y_filter_8_11);
-      d3 = convolve12_y_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
-                                s14, y_filter_0_7, y_filter_8_11);
+      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
+      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
+      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
+      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
 
-      dd01 = vcombine_s16(d0, d1);
-      dd23 = vcombine_s16(d2, d3);
+      int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                    s11, y_filter_0_7, y_filter_8_11);
+      int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                    s11, s12, y_filter_0_7, y_filter_8_11);
+      int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                                    s12, s13, y_filter_0_7, y_filter_8_11);
+      int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                                    s13, s14, y_filter_0_7, y_filter_8_11);
 
-      d01 = vqrshrun_n_s16(dd01, FILTER_BITS);
-      d23 = vqrshrun_n_s16(dd23, FILTER_BITS);
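+      // The 12-tap filter is not halved, so shift by the full FILTER_BITS.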
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      if (w == 2) {
-        store_u8_2x1(dst_ptr + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst_ptr + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(dst_ptr + 2 * dst_stride, d23, 0);
-          store_u8_2x1(dst_ptr + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
-          store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
-        }
-      }
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
 
       s0 = s4;
       s1 = s5;
@@ -1791,54 +904,50 @@
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
-  } else {
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
-    uint8x8_t d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+    } while (h != 0);
 
+  } else {
     do {
       const uint8_t *s = src_ptr;
       uint8_t *d = dst_ptr;
       int height = h;
 
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
       load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                    &t9, &t10);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-      s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
-      s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
-      s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
-      s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
 
       s += 11 * src_stride;
 
       do {
+        uint8x8_t t11, t12, t13, t14;
         load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
 
-        d0 = convolve12_y_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                  s11, y_filter_0_7, y_filter_8_11);
-        d1 = convolve12_y_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                  s12, y_filter_0_7, y_filter_8_11);
-        d2 = convolve12_y_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                  s13, y_filter_0_7, y_filter_8_11);
-        d3 = convolve12_y_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                  s13, s14, y_filter_0_7, y_filter_8_11);
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
 
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-        } else {
-          store_u8_8x2(d, dst_stride, d0, d1);
-        }
+        uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+                                      s10, s11, y_filter_0_7, y_filter_8_11);
+        uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                      s11, s12, y_filter_0_7, y_filter_8_11);
+        uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                                      s12, s13, y_filter_0_7, y_filter_8_11);
+        uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                                      s13, s14, y_filter_0_7, y_filter_8_11);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -1854,12 +963,11 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
-
+      } while (height != 0);
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
@@ -1867,8 +975,15 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_y_qn) {
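+  // The Neon kernels below do not support 2-wide or 2-high blocks; use the
+  // C implementation for those sizes.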
+  if (w == 2 || h == 2) {
+    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                        subpel_y_qn);
+    return;
+  }
+
   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
-  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+  const int vert_offset = clamped_y_taps / 2 - 1;
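+  // Clamp the tap count so that shorter filters (e.g. 4-tap) take the 6-tap
+  // path below with an appropriate source offset.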
 
   src -= vert_offset * src_stride;
 
@@ -1881,635 +996,27 @@
     return;
   }
 
-  // Filter values are even so downshift by 1 to reduce precision requirements.
+  // Filter values are even, so halve them to reduce precision requirements.
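+  // The convolve helpers above compensate by narrowing with a shift of
+  // FILTER_BITS - 1 instead of FILTER_BITS.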
   const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
 
   if (y_filter_taps < 8) {
     convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
-    return;
-  }
-
-  if (w <= 4) {
-    uint8x8_t d01;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if AOM_ARCH_AARCH64
-    uint8x8_t d23;
-    int16x4_t s8, s9, s10, d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-
-    do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-#if AOM_ARCH_AARCH64
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
-      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
-      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      if (w == 2) {
-        store_u8_2x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_2x1(dst + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(dst + 2 * dst_stride, d23, 0);
-          store_u8_4x1(dst + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      dst += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-
-      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS - 1);
-
-      if (w == 4) {
-        store_u8_4x1(dst, d01, 0);
-      } else if (w == 2) {
-        store_u8_2x1(dst, d01, 0);
-      }
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      dst += dst_stride;
-      h -= 1;
-#endif  // AOM_ARCH_AARCH64
-    } while (h > 0);
   } else {
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    uint8x8_t t0;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-#if AOM_ARCH_AARCH64
-    uint8x8_t t1, t2, t3;
-    int16x8_t s8, s9, s10;
-#endif  // AOM_ARCH_AARCH64
-    do {
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      __builtin_prefetch(src + 4 * src_stride);
-      __builtin_prefetch(src + 5 * src_stride);
-      __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      d = dst;
-      height = h;
-
-      do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-#if AOM_ARCH_AARCH64
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
-        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
-        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
-
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, t0, t1, t2, t3);
-        } else {
-          store_u8_8x2(d, dst_stride, t0, t1);
-        }
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        __builtin_prefetch(d);
-        __builtin_prefetch(s);
-
-        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-
-        vst1_u8(d, t0);
-        d += dst_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        height -= 1;
-#endif  // AOM_ARCH_AARCH64
-      } while (height > 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w > 0);
+    convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
   }
 }
 
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve12_horiz_4_usdot(uint8x16_t samples,
-                                                 const int8x16_t filters,
-                                                 const uint8x16x3_t permute_tbl,
-                                                 int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
-  sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vshrn_n_s32(sum, ROUND0_BITS);
-}
-
-static INLINE int16x8_t convolve12_horiz_8_usdot(uint8x16_t samples0,
-                                                 uint8x16_t samples1,
-                                                 const int8x16_t filters,
-                                                 const uint8x16x3_t permute_tbl,
-                                                 const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[4];
-  int32x4_t sum[2];
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples0, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples0, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples0, permute_tbl.val[2]);
-  /* {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } */
-  permuted_samples[3] = vqtbl1q_u8(samples1, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
-  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
-  /* Second 4 output values. */
-  sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
-  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
-  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
-
-  /* Narrow and re-pack. */
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
-                      vshrn_n_s32(sum[1], ROUND0_BITS));
-}
-
-static INLINE void convolve_2d_sr_horiz_12tap_neon(
-    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
-    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11) {
-  const int bd = 8;
-
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
-    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
-    // Undo the horizontal offset in the calling function.
-    src_ptr += 5;
-
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j += 8) {
-        uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
-        uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
-        int16x8_t d0 =
-            vshlq_n_s16(vreinterpretq_s16_u16(t0), FILTER_BITS - ROUND0_BITS);
-        if (w == 2) {
-          store_s16_2x1(dst_ptr + i * dst_stride, vget_low_s16(d0), 0);
-        } else if (w == 4) {
-          vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
-        } else {
-          vst1q_s16(dst_ptr + i * dst_stride + j, d0);
-        }
-      }
-    }
-  } else {
-    // Narrow filter values to 8-bit.
-    const int16x8x2_t x_filter_s16 = {
-      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
-    };
-    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
-                                           vmovn_s16(x_filter_s16.val[1]));
-    // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
-    // - which are generally faster than rounding shifts on modern CPUs.
-    const int32x4_t horiz_const =
-        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    if (w <= 4) {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0, s1, s2, s3;
-          int16x4_t d0, d1, d2, d3;
-
-          load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-          d0 = convolve12_horiz_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-          d1 = convolve12_horiz_4_usdot(s1, x_filter, permute_tbl, horiz_const);
-          d2 = convolve12_horiz_4_usdot(s2, x_filter, permute_tbl, horiz_const);
-          d3 = convolve12_horiz_4_usdot(s3, x_filter, permute_tbl, horiz_const);
-
-          if (w == 2) {
-            store_s16_2x1(d + 0 * dst_stride, d0, 0);
-            store_s16_2x1(d + 1 * dst_stride, d1, 0);
-            store_s16_2x1(d + 2 * dst_stride, d2, 0);
-            store_s16_2x1(d + 3 * dst_stride, d3, 0);
-          } else {
-            store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
-          }
-
-          s += 4;
-          d += 4;
-          width -= 4;
-        } while (width > 0);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h >= 4);
-
-      for (; h > 0; h--) {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0;
-          int16x4_t d0;
-
-          s0 = vld1q_u8(s);
-
-          d0 = convolve12_horiz_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-
-          if (w == 2) {
-            store_s16_2x1(d, d0, 0);
-          } else {
-            vst1_s16(d, d0);
-          }
-
-          s += 4;
-          d += 4;
-          width -= 4;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      }
-    } else {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          int16x8_t d0, d1, d2, d3;
-
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          d0 = convolve12_horiz_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
-                                        horiz_const);
-          d1 = convolve12_horiz_8_usdot(s1[0], s1[1], x_filter, permute_tbl,
-                                        horiz_const);
-          d2 = convolve12_horiz_8_usdot(s2[0], s2[1], x_filter, permute_tbl,
-                                        horiz_const);
-          d3 = convolve12_horiz_8_usdot(s3[0], s3[1], x_filter, permute_tbl,
-                                        horiz_const);
-
-          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h >= 4);
-
-      for (; h > 0; h--) {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2];
-          int16x8_t d0;
-
-          s0[0] = vld1q_u8(s);
-          s0[1] = vld1q_u8(s + 4);
-
-          d0 = convolve12_horiz_8_usdot(s0[0], s0[1], x_filter, permute_tbl,
-                                        horiz_const);
-
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      }
-    }
-  }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void convolve_2d_sr_horiz_12tap_neon(
-    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
-    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11) {
-  const int bd = 8;
-
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
-    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd - 1)));
-    // Undo the horizontal offset in the calling function.
-    src_ptr += 5;
-
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j += 8) {
-        uint8x8_t s0 = vld1_u8(src_ptr + i * src_stride + j);
-        uint16x8_t t0 = vaddw_u8(vreinterpretq_u16_s16(horiz_const), s0);
-        int16x8_t d0 =
-            vshlq_n_s16(vreinterpretq_s16_u16(t0), FILTER_BITS - ROUND0_BITS);
-        if (w == 2) {
-          store_s16_2x1(dst_ptr + i * dst_stride, vget_low_s16(d0), 0);
-        } else if (w == 4) {
-          vst1_s16(dst_ptr + i * dst_stride, vget_low_s16(d0));
-        } else {
-          vst1q_s16(dst_ptr + i * dst_stride + j, d0);
-        }
-      }
-    }
-  } else {
-    // Narrow filter values to 8-bit.
-    const int16x8x2_t x_filter_s16 = {
-      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
-    };
-    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
-                                           vmovn_s16(x_filter_s16.val[1]));
-
-    // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
-    // - which are generally faster than rounding shifts on modern CPUs.
-    const int32_t horiz_const =
-        ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
-    // Dot product constants.
-    const int32x4_t correct_tmp =
-        vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
-                  vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
-    const int32x4_t correction =
-        vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
-    const uint8x16_t range_limit = vdupq_n_u8(128);
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    if (w <= 4) {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0, s1, s2, s3;
-          int16x4_t d0, d1, d2, d3;
-
-          load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-          d0 = convolve12_horiz_4_sdot(s0, x_filter, correction, range_limit,
-                                       permute_tbl);
-          d1 = convolve12_horiz_4_sdot(s1, x_filter, correction, range_limit,
-                                       permute_tbl);
-          d2 = convolve12_horiz_4_sdot(s2, x_filter, correction, range_limit,
-                                       permute_tbl);
-          d3 = convolve12_horiz_4_sdot(s3, x_filter, correction, range_limit,
-                                       permute_tbl);
-
-          if (w == 2) {
-            store_s16_2x1(d + 0 * dst_stride, d0, 0);
-            store_s16_2x1(d + 1 * dst_stride, d1, 0);
-            store_s16_2x1(d + 2 * dst_stride, d2, 0);
-            store_s16_2x1(d + 3 * dst_stride, d3, 0);
-          } else {
-            store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
-          }
-
-          s += 4;
-          d += 4;
-          width -= 4;
-        } while (width > 0);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h >= 4);
-
-      for (; h > 0; h--) {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0;
-          int16x4_t d0;
-
-          s0 = vld1q_u8(s);
-
-          d0 = convolve12_horiz_4_sdot(s0, x_filter, correction, range_limit,
-                                       permute_tbl);
-
-          if (w == 2) {
-            store_s16_2x1(d, d0, 0);
-          } else {
-            vst1_s16(d, d0);
-          }
-
-          s += 4;
-          d += 4;
-          width -= 4;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      }
-    } else {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          int16x8_t d0, d1, d2, d3;
-
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          d0 = convolve12_horiz_8_sdot(s0[0], s0[1], x_filter, correction,
-                                       range_limit, permute_tbl);
-          d1 = convolve12_horiz_8_sdot(s1[0], s1[1], x_filter, correction,
-                                       range_limit, permute_tbl);
-          d2 = convolve12_horiz_8_sdot(s2[0], s2[1], x_filter, correction,
-                                       range_limit, permute_tbl);
-          d3 = convolve12_horiz_8_sdot(s3[0], s3[1], x_filter, correction,
-                                       range_limit, permute_tbl);
-
-          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h >= 4);
-
-      for (; h > 0; h--) {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2];
-          int16x8_t d0;
-
-          s0[0] = vld1q_u8(s);
-          s0[1] = vld1q_u8(s + 4);
-
-          d0 = convolve12_horiz_8_sdot(s0[0], s0[1], x_filter, correction,
-                                       range_limit, permute_tbl);
-
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      }
-    }
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE int16x4_t convolve12_horiz_4x4_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
-    const int32x4_t horiz_const) {
+static INLINE int16x4_t
+convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                  const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                  const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+                  const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+                  const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
+                  const int32x4_t horiz_const) {
   const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
   const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
-  int32x4_t sum;
 
-  sum = horiz_const;
+  int32x4_t sum = horiz_const;
   sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
   sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
   sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
@@ -2526,136 +1033,68 @@
   return vshrn_n_s32(sum, ROUND0_BITS);
 }
 
-// 4 column per iteration horizontal filtering for 12-tap convolve_2d_sr.
-// Processes one row at a time.
-static INLINE void horiz_filter_12tap_w4_single_row(
-    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
-    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11, const int32x4_t horiz_const) {
-  do {
-    const uint8_t *s = src_ptr;
-    int16_t *d = dst_ptr;
-    int width = w;
-
-    do {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, d0;
-      uint8x16_t t0;
-      int16x8_t tt0, tt1;
-
-      t0 = vld1q_u8(s);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
-
-      s0 = vget_low_s16(tt0);
-      s4 = vget_high_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s12 = vget_high_s16(tt1);
-
-      s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
-      s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
-      s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
-      s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
-      s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
-      s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
-      s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
-      s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
-      s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
-
-      d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                    s11, x_filter_0_7, x_filter_8_11,
-                                    horiz_const);
-
-      if (w == 2) {
-        store_s16_2x1(d, d0, 0);
-      } else {
-        vst1_s16(d, d0);
-      }
-
-      s += 4;
-      d += 4;
-      width -= 4;
-    } while (width > 0);
-
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-    h--;
-  } while (h > 0);
-}
-
 static INLINE void convolve_2d_sr_horiz_12tap_neon(
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
     const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
     const int16x4_t x_filter_8_11) {
   const int bd = 8;
-  // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
   // which are generally faster than rounding shifts on modern CPUs.
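+  // (Adding 1 << (ROUND0_BITS - 1) up front makes the plain right shift by
+  // ROUND0_BITS in the helper below equivalent to a rounding shift.)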
   const int32x4_t horiz_const =
       vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
 
 #if AOM_ARCH_AARCH64
   do {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint8x8_t t0, t1, t2, t3;
-
     const uint8_t *s = src_ptr;
     int16_t *d = dst_ptr;
     int width = w;
 
+    uint8x8_t t0, t1, t2, t3;
     load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
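+    // After the transpose, each int16x4 holds one input column across the
+    // four loaded rows, so the horizontal filter below can use the same
+    // lane-wise multiply-accumulate pattern as the vertical kernels.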
 
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
     load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
-    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
 
-    s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
 
     s += 11;
 
     do {
-      int16x4_t s11, s12, s13, s14, d0, d1, d2, d3;
-
       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
 
-      s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-      d0 = convolve12_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                    s11, x_filter_0_7, x_filter_8_11,
-                                    horiz_const);
-      d1 = convolve12_horiz_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                    s11, s12, x_filter_0_7, x_filter_8_11,
-                                    horiz_const);
-      d2 = convolve12_horiz_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                    s12, s13, x_filter_0_7, x_filter_8_11,
-                                    horiz_const);
-      d3 = convolve12_horiz_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                    s13, s14, x_filter_0_7, x_filter_8_11,
-                                    horiz_const);
+      int16x4_t d0 =
+          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                            x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d1 =
+          convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                            x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d2 =
+          convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                            x_filter_0_7, x_filter_8_11, horiz_const);
+      int16x4_t d3 =
+          convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+                            x_filter_0_7, x_filter_8_11, horiz_const);
 
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
-      if (w == 2) {
-        store_s16_2x1(d + 0 * dst_stride, d0, 0);
-        store_s16_2x1(d + 1 * dst_stride, d1, 0);
-        store_s16_2x1(d + 2 * dst_stride, d2, 0);
-        store_s16_2x1(d + 3 * dst_stride, d3, 0);
-      } else {
-        store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
-      }
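+      // The block was processed in transposed form (each d vector holds one
+      // output column for four rows), so transpose back to row order before
+      // storing.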
+      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+      store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -2668,562 +1107,192 @@
       s8 = s12;
       s9 = s13;
       s10 = s14;
+      s += 4;
+      d += 4;
+      width -= 4;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    h -= 4;
+  } while (h > 4);
+#endif  // AOM_ARCH_AARCH64
+
+  do {
+    const uint8_t *s = src_ptr;
+    int16_t *d = dst_ptr;
+    int width = w;
+
+    do {
+      uint8x16_t t0 = vld1q_u8(s);
+      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+
+      int16x4_t s0 = vget_low_s16(tt0);
+      int16x4_t s4 = vget_high_s16(tt0);
+      int16x4_t s8 = vget_low_s16(tt1);
+      int16x4_t s12 = vget_high_s16(tt1);
+
+      int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
+      int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
+      int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
+      int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
+      int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
+      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
+      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
+
+      int16x4_t d0 =
+          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                            x_filter_0_7, x_filter_8_11, horiz_const);
+      vst1_s16(d, d0);
 
       s += 4;
       d += 4;
       width -= 4;
-    } while (width > 0);
-
-    src_ptr += 4 * src_stride;
-    dst_ptr += 4 * dst_stride;
-    h -= 4;
-  } while (h >= 4);
-
-  if (h) {
-    horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride,
-                                     w, h, x_filter_0_7, x_filter_8_11,
-                                     horiz_const);
-  }
-#else   // !AOM_ARCH_AARCH64
-  horiz_filter_12tap_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
-                                   h, x_filter_0_7, x_filter_8_11, horiz_const);
-#endif  // AOM_ARCH_AARCH64
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
-    int im_h, const int16x8_t x_filter_s16) {
-  const int bd = 8;
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-
-  int height = im_h;
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
-  // This shim of  1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // The outermost -1 is needed because we halved the filter values.
-  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
-                                            (1 << ((ROUND0_BITS - 1) - 1)));
-
-  if (w <= 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-      t1 = convolve8_4_usdot(s1, x_filter, permute_tbl, horiz_const);
-      t2 = convolve8_4_usdot(s2, x_filter, permute_tbl, horiz_const);
-      t3 = convolve8_4_usdot(s3, x_filter, permute_tbl, horiz_const);
-
-      // We halved the convolution filter values so -1 from the right shift.
-      d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-      d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
-      d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
-      d3 = vshrn_n_s32(t3, ROUND0_BITS - 1);
-
-      if (w == 2) {
-        store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
-        store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
-        store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
-        store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
-      } else {
-        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-      }
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        s0 = vld1q_u8(src_ptr);
-        t0 = convolve8_4_usdot(s0, x_filter, permute_tbl, horiz_const);
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-
-        if (w == 2) {
-          store_s16_2x1(dst_ptr, d0, 0);
-        } else {
-          vst1_s16(dst_ptr, d0);
-        }
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_horiz_8_usdot(s0, x_filter, permute_tbl, horiz_const);
-        d1 = convolve8_horiz_8_usdot(s1, x_filter, permute_tbl, horiz_const);
-        d2 = convolve8_horiz_8_usdot(s2, x_filter, permute_tbl, horiz_const);
-        d3 = convolve8_horiz_8_usdot(s3, x_filter, permute_tbl, horiz_const);
-
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          s0 = vld1q_u8(s);
-          d0 = convolve8_horiz_8_usdot(s0, x_filter, permute_tbl, horiz_const);
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
-    int im_h, const int16x8_t x_filter_s16) {
-  const int bd = 8;
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-
-  int height = im_h;
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
-  // This shim of  1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // The outermost -1 is needed because we halved the filter values.
-  const int32_t horiz_const =
-      ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
-  // Dot product constants.
-  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
-  int32x4_t correction = vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
-  if (w <= 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit, permute_tbl);
-      t1 = convolve8_4_sdot(s1, x_filter, correction, range_limit, permute_tbl);
-      t2 = convolve8_4_sdot(s2, x_filter, correction, range_limit, permute_tbl);
-      t3 = convolve8_4_sdot(s3, x_filter, correction, range_limit, permute_tbl);
-
-      // We halved the convolution filter values so -1 from the right shift.
-      d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-      d1 = vshrn_n_s32(t1, ROUND0_BITS - 1);
-      d2 = vshrn_n_s32(t2, ROUND0_BITS - 1);
-      d3 = vshrn_n_s32(t3, ROUND0_BITS - 1);
-
-      if (w == 2) {
-        store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
-        store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
-        store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
-        store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
-      } else {
-        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-      }
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        s0 = vld1q_u8(src_ptr);
-        t0 = convolve8_4_sdot(s0, x_filter, correction, range_limit,
-                              permute_tbl);
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrn_n_s32(t0, ROUND0_BITS - 1);
-
-        if (w == 2) {
-          store_s16_2x1(dst_ptr, d0, 0);
-        } else {
-          vst1_s16(dst_ptr, d0);
-        }
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_horiz_8_sdot(s0, x_filter, correction, range_limit,
-                                    permute_tbl);
-        d1 = convolve8_horiz_8_sdot(s1, x_filter, correction, range_limit,
-                                    permute_tbl);
-        d2 = convolve8_horiz_8_sdot(s2, x_filter, correction, range_limit,
-                                    permute_tbl);
-        d3 = convolve8_horiz_8_sdot(s3, x_filter, correction, range_limit,
-                                    permute_tbl);
-
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          s0 = vld1q_u8(s);
-          d0 = convolve8_8_sdot(s0, x_filter, correction, range_limit,
-                                permute_tbl, vdupq_n_s16(0));
-          // We halved the convolution filter values so -1 from the right shift.
-          d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-// Horizontal filtering for convolve_2d_sr for width multiple of 8
-// Processes one row at a time
-static INLINE void horiz_filter_w8_single_row(const uint8_t *src_ptr,
-                                              int src_stride, int16_t *dst_ptr,
-                                              const int dst_stride, int width,
-                                              int height,
-                                              const int16x8_t x_filter,
-                                              const int16x8_t horiz_const) {
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-  do {
-    uint8x8_t t0 = vld1_u8(src_ptr);
-    s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
-
-    int width_tmp = width;
-    const uint8_t *s = src_ptr + 8;
-    int16_t *dst_tmp = dst_ptr;
-
-    __builtin_prefetch(dst_ptr);
-
-    do {
-      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-      s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      int16x8_t sum = s0;
-      s0 = s7;
-
-      s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-      s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-      s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-      s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-      s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-      s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-      s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-      int16x8_t res0 = convolve8_horiz_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
-                                               x_filter, horiz_const);
-
-      vst1q_s16(dst_tmp, res0);
-
-      s += 8;
-      dst_tmp += 8;
-      width_tmp -= 8;
-    } while (width_tmp > 0);
+    } while (width != 0);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    height--;
-  } while (height > 0);
+  } while (--h != 0);
 }
 
-// Horizontal filtering for convolve_2d_sr for width <= 4
-// Processes one row at a time
-static INLINE void horiz_filter_w4_single_row(const uint8_t *src_ptr,
-                                              int src_stride, int16_t *dst_ptr,
-                                              const int dst_stride, int width,
-                                              int height,
-                                              const int16x8_t x_filter,
-                                              const int16x4_t horiz_const) {
-  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-  do {
-    const uint8_t *s = src_ptr;
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t s2, const int16x4_t s3,
+                                         const int16x4_t filter,
+                                         const int16x4_t horiz_const) {
+  int16x4_t sum = horiz_const;
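+  // The caller halves the filter values, which keeps this 16-bit accumulation
+  // within int16_t range (no widening multiply-accumulate needed).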
+  sum = vmla_lane_s16(sum, s0, filter, 0);
+  sum = vmla_lane_s16(sum, s1, filter, 1);
+  sum = vmla_lane_s16(sum, s2, filter, 2);
+  sum = vmla_lane_s16(sum, s3, filter, 3);
 
-    __builtin_prefetch(s);
-
-    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
-    int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    s0 = vget_low_s16(tt0);
-    s4 = vget_high_s16(tt0);
-
-    __builtin_prefetch(dst_ptr);
-    s += 8;
-
-    t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-    s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-    s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-    s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-    s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-    s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
-    s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
-    s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
-
-    int16x4_t d0 = convolve8_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                           x_filter, horiz_const);
-
-    if (width == 2) {
-      store_s16_2x1(dst_ptr, d0, 0);
-    } else {
-      vst1_s16(dst_ptr, d0);
-    }
-
-    dst_ptr += dst_stride;
-    src_ptr += src_stride;
-    height--;
-  } while (height > 0);
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshr_n_s16(sum, ROUND0_BITS - 1);
 }
 
-static INLINE void convolve_2d_sr_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
-    int im_h, const int16x8_t x_filter_s16) {
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         const int16x8_t filter,
+                                         const int16x8_t horiz_const) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
+  int16x8_t sum = horiz_const;
+  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshrq_n_s16(sum, ROUND0_BITS - 1);
+}
+
+static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
+                                             int16_t *im_block, int im_stride,
+                                             int w, int im_h,
+                                             const int16_t *x_filter_ptr) {
   const int bd = 8;
 
   const uint8_t *src_ptr = src;
   int16_t *dst_ptr = im_block;
   int dst_stride = im_stride;
-
   int height = im_h;
 
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
-
   if (w <= 4) {
-    // This shim of  1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
     // shifts - which are generally faster than rounding shifts on modern CPUs.
-    // The outermost -1 is needed because we halved the filter values.
+    // (The extra -1 is needed because we halved the filter values.)
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                              (1 << ((ROUND0_BITS - 1) - 1)));
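+    // (With the usual FILTER_BITS == 7 and ROUND0_BITS == 3 this works out to
+    // 8192 + 2.)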
+    // 4-tap filters are used for blocks of width <= 4.
+    // Filter values are even, so halve them to reduce intermediate precision
+    // requirements.
+    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
 
-#if AOM_ARCH_AARCH64
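+    // The 4-tap filter occupies taps 2..5 of the 8-tap filter array, so
+    // advance the source pointer by 2 to compensate for the skipped taps.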
+    src_ptr += 2;
+
     do {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-      uint8x8_t t0, t1, t2, t3;
-      const uint8_t *s = src_ptr;
+      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
 
-      assert(height >= 4);
+      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
 
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
 
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      vst1_s16(dst_ptr, d0);
 
-      s += 7;
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_horiz_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                                   horiz_const);
-      d1 = convolve8_horiz_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                                   horiz_const);
-      d2 = convolve8_horiz_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                                   horiz_const);
-      d3 = convolve8_horiz_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                                   horiz_const);
-
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
-      if (w == 2) {
-        store_s16_2x1(dst_ptr + 0 * dst_stride, d0, 0);
-        store_s16_2x1(dst_ptr + 1 * dst_stride, d1, 0);
-        store_s16_2x1(dst_ptr + 2 * dst_stride, d2, 0);
-        store_s16_2x1(dst_ptr + 3 * dst_stride, d3, 0);
-      } else {
-        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-      }
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
-                                 height, x_filter, horiz_const);
-    }
-
-#else   // !AOM_ARCH_AARCH64
-    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
-                               height, x_filter, horiz_const);
-#endif  // AOM_ARCH_AARCH64
-
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
   } else {
-    // This shim of  1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
     // shifts - which are generally faster than rounding shifts on modern CPUs.
-    // The outermost -1 is needed because we halved the filter values.
+    // (The extra -1 is needed because we halved the filter values.)
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                               (1 << ((ROUND0_BITS - 1) - 1)));
+    // Filter values are even, so halve them to reduce intermediate precision
+    // requirements.
+    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
 #if AOM_ARCH_AARCH64
-
-    for (; height >= 8; height -= 8) {
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
-          d0, d1, d2, d3, d4, d5, d6, d7;
-      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+    while (height > 8) {
       const uint8_t *s = src_ptr;
       int16_t *d = dst_ptr;
       int width = w;
 
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
       load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
       s += 7;
 
       do {
         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-        d0 = convolve8_horiz_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                                     horiz_const);
-        d1 = convolve8_horiz_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                                     horiz_const);
-        d2 = convolve8_horiz_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                                     horiz_const);
-        d3 = convolve8_horiz_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                                     horiz_const);
-        d4 = convolve8_horiz_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                                     horiz_const);
-        d5 = convolve8_horiz_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                     x_filter, horiz_const);
-        d6 = convolve8_horiz_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                     x_filter, horiz_const);
-        d7 = convolve8_horiz_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                     x_filter, horiz_const);
+        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        x_filter, horiz_const);
+        int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+                                        x_filter, horiz_const);
+        int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+                                        x_filter, horiz_const);
+        int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+                                        x_filter, horiz_const);
+        int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+                                        x_filter, horiz_const);
+        int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+                                        x_filter, horiz_const);
+        int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+                                        x_filter, horiz_const);
+        int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+                                        x_filter, horiz_const);
 
-        transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
         store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
@@ -3237,654 +1306,46 @@
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
-
+      } while (width != 0);
       src_ptr += 8 * src_stride;
       dst_ptr += 8 * dst_stride;
+      height -= 8;
     }
+#endif  // AOM_ARCH_AARCH64
 
-    for (; height >= 4; height -= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
-          dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-      int16x8_t d0, d1, d2, d3;
-      uint8x8_t t0, t1, t2, t3;
-
+    do {
       const uint8_t *s = src_ptr;
       int16_t *d = dst_ptr;
       int width = w;
 
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      s += 7;
+      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
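+      // Slide an 8-pixel window along the row: each iteration loads the next
+      // 8 pixels and forms the shifted inputs s1..s7 with vext, reusing s0
+      // from the previous iteration.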
 
       do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+        uint8x8_t t1 = vld1_u8(s + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
 
-        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-        s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
 
-        dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
-        dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
-        dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
-        dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
-        dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
-        dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
-        dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
-        dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
+        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        x_filter, horiz_const);
 
-        transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
-                          &d1, &d2, &d3);
-
-        d0 = vaddq_s16(d0, horiz_const);
-        d1 = vaddq_s16(d1, horiz_const);
-        d2 = vaddq_s16(d2, horiz_const);
-        d3 = vaddq_s16(d3, horiz_const);
-
-        // We halved the convolution filter values so -1 from the right shift.
-        d0 = vshrq_n_s16(d0, ROUND0_BITS - 1);
-        d1 = vshrq_n_s16(d1, ROUND0_BITS - 1);
-        d2 = vshrq_n_s16(d2, ROUND0_BITS - 1);
-        d3 = vshrq_n_s16(d3, ROUND0_BITS - 1);
-
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+        vst1q_s16(d, d0);
 
         s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-    }
-
-    if (height) {
-      assert(height < 4);
-      horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
-                                 height, x_filter, horiz_const);
-    }
-
-#else   // !AOM_ARCH_AARCH64
-    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
-                               height, x_filter, horiz_const);
-#endif  // AOM_ARCH_AARCH64
-  }
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int32x4_t convolve12_vert_4_s32(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-  int32x4_t sum;
-
-  sum = vmull_lane_s16(s0, y_filter_0_3, 0);
-  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
-  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
-  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
-  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
-  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
-  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
-  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
-  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
-  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
-  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
-  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
-
-  return sum;
-}
-
-static INLINE uint8x8_t convolve12_vert_8_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
-    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int16x8_t sub_const) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-  int32x4_t sum0, sum1;
-  int16x8_t res;
-
-  sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
-
-  sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
-
-  res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
-                     vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
-  res = vsubq_s16(res, sub_const);
-
-  return vqmovun_s16(res);
-}
-
-static INLINE void convolve_2d_sr_vert_12tap_neon(
-    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
-    int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
-  const int bd = 8;
-  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
-
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-    int32x4_t d0, d1, d2, d3;
-    int16x8_t dd01, dd23;
-    uint8x8_t d01, d23;
-
-    load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
-                  &s8, &s9, &s10);
-    src_ptr += 11 * src_stride;
-
-    do {
-      load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
-
-      d0 = convolve12_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                 s11, y_filter_0_7, y_filter_8_11);
-      d1 = convolve12_vert_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                 s12, y_filter_0_7, y_filter_8_11);
-      d2 = convolve12_vert_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                 s13, y_filter_0_7, y_filter_8_11);
-      d3 = convolve12_vert_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
-                                 s14, y_filter_0_7, y_filter_8_11);
-
-      dd01 = vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
-                          vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
-      dd23 = vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
-                          vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
-
-      dd01 = vsubq_s16(dd01, sub_const);
-      dd23 = vsubq_s16(dd23, sub_const);
-
-      d01 = vqmovun_s16(dd01);
-      d23 = vqmovun_s16(dd23);
-
-      if (w == 2) {
-        store_u8_2x1(dst_ptr + 0 * dst_stride, d01, 0);
-        store_u8_2x1(dst_ptr + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(dst_ptr + 2 * dst_stride, d23, 0);
-          store_u8_2x1(dst_ptr + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
-        store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
-          store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      s7 = s11;
-      s8 = s12;
-      s9 = s13;
-      s10 = s14;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
-
-  } else {
-    do {
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-      uint8x8_t d0, d1, d2, d3;
-
-      int16_t *s = src_ptr;
-      uint8_t *d = dst_ptr;
-
-      int height = h;
-
-      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
-                    &s9, &s10);
-      s += 11 * src_stride;
-
-      do {
-        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
-
-        d0 = convolve12_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                   s11, y_filter_0_7, y_filter_8_11, sub_const);
-        d1 = convolve12_vert_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                   s12, y_filter_0_7, y_filter_8_11, sub_const);
-        d2 =
-            convolve12_vert_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                  s13, y_filter_0_7, y_filter_8_11, sub_const);
-        d3 = convolve12_vert_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
-                                   s13, s14, y_filter_0_7, y_filter_8_11,
-                                   sub_const);
-
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-        } else {
-          store_u8_8x2(d, dst_stride, d0, d1);
-        }
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s7 = s11;
-        s8 = s12;
-        s9 = s13;
-        s10 = s14;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height > 0);
-
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
-  }
-}
-
-static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
-                                                 int src_stride,
-                                                 uint8_t *dst_ptr,
-                                                 int dst_stride, int w, int h,
-                                                 const int16x8_t y_filter) {
-  const int bd = 8;
-  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
-
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-    uint8x8_t d01;
-
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10, d1, d2, d3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    int16_t *s = src_ptr;
-    uint8_t *d = dst_ptr;
-
-    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    s += 7 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-      d0 = convolve8_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-      d1 = convolve8_vert_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
-      d2 = convolve8_vert_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
-      d3 = convolve8_vert_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
-
-      d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
-      d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
-
-      if (w == 2) {
-        store_u8_2x1(d + 0 * dst_stride, d01, 0);
-        store_u8_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(d + 2 * dst_stride, d23, 0);
-          store_u8_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(d + 0 * dst_stride, d01, 0);
-        store_u8_4x1(d + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(d + 2 * dst_stride, d23, 0);
-          store_u8_4x1(d + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      s += 4 * src_stride;
-      d += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s7 = vld1_s16(s);
-      s += src_stride;
-
-      d0 = convolve8_vert_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
-
-      d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
-
-      if (w == 2) {
-        store_u8_2x1(d, d01, 0);
-      } else {
-        store_u8_4x1(d, d01, 0);
-      }
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      d += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h > 0);
-  } else {
-    // if width is a multiple of 8 & height is a multiple of 4
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint8x8_t d0;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10;
-    uint8x8_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int height = h;
-      int16_t *s = src_ptr;
-      uint8_t *d = dst_ptr;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                  sub_const);
-        d1 = convolve8_vert_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                  sub_const);
-        d2 = convolve8_vert_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                  sub_const);
-        d3 = convolve8_vert_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                  sub_const);
-
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-        } else {
-          store_u8_8x2(d, dst_stride, d0, d1);
-        }
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vld1q_s16(s);
-
-        d0 = convolve8_vert_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                  sub_const);
-
-        vst1_u8(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height > 0);
-
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
-  }
-}
-
-static INLINE int16x4_t
-convolve6_vert_4_s32(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                     const int16x8_t y_filter) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum;
-
-  sum = vmull_lane_s16(s0, y_filter_lo, 1);
-  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
-  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
-  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
-  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
-  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
-
-  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE uint8x8_t
-convolve6_vert_8_s32(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                     const int16x8_t y_filter, const int16x8_t sub_const) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-  int32x4_t sum0, sum1;
-  int16x8_t res;
-
-  sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
-
-  sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
-
-  res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
-                     vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
-  res = vsubq_s16(res, sub_const);
-
-  return vqmovun_s16(res);
-}
-
-static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
-                                                 int src_stride,
-                                                 uint8_t *dst_ptr,
-                                                 int dst_stride, int w, int h,
-                                                 const int16x8_t y_filter) {
-  const int bd = 8;
-  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
-
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, d0;
-    uint8x8_t d01;
-
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8, d1, d2, d3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    int16_t *s = src_ptr;
-    uint8_t *d = dst_ptr;
-
-    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
-    s += 5 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
-
-      d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
-      d1 = convolve6_vert_4_s32(s1, s2, s3, s4, s5, s6, y_filter);
-      d2 = convolve6_vert_4_s32(s2, s3, s4, s5, s6, s7, y_filter);
-      d3 = convolve6_vert_4_s32(s3, s4, s5, s6, s7, s8, y_filter);
-
-      d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
-      d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
-
-      if (w == 2) {
-        store_u8_2x1(d + 0 * dst_stride, d01, 0);
-        store_u8_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u8_2x1(d + 2 * dst_stride, d23, 0);
-          store_u8_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        store_u8_4x1(d + 0 * dst_stride, d01, 0);
-        store_u8_4x1(d + 1 * dst_stride, d01, 1);
-        if (h != 2) {
-          store_u8_4x1(d + 2 * dst_stride, d23, 0);
-          store_u8_4x1(d + 3 * dst_stride, d23, 1);
-        }
-      }
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s += 4 * src_stride;
-      d += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s5 = vld1_s16(s);
-
-      d0 = convolve6_vert_4_s32(s0, s1, s2, s3, s4, s5, y_filter);
-      d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
-
-      if (w == 2) {
-        store_u8_2x1(d, d01, 0);
-      } else {
-        store_u8_4x1(d, d01, 0);
-      }
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s += src_stride;
-      d += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h > 0);
-  } else {
-    // if width is a multiple of 8 & height is a multiple of 4
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint8x8_t d0;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8;
-    uint8x8_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int height = h;
-      int16_t *s = src_ptr;
-      uint8_t *d = dst_ptr;
-
-      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
-        d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
-        d1 = convolve6_vert_8_s32(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
-        d2 = convolve6_vert_8_s32(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
-        d3 = convolve6_vert_8_s32(s3, s4, s5, s6, s7, s8, y_filter, sub_const);
-
-        if (h != 2) {
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-        } else {
-          store_u8_8x2(d, dst_stride, d0, d1);
-        }
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vld1q_s16(s);
-
-        d0 = convolve6_vert_8_s32(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
-
-        vst1_u8(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height > 0);
-
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
   }
 }
 
@@ -3894,7 +1355,13 @@
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
-  (void)conv_params;
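+  // The NEON kernels below assume blocks of at least 4x4, so fall back to the
+  // C implementation for 2-wide or 2-tall blocks.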
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
   const int im_h = h + clamped_y_taps - 1;
@@ -3924,14 +1391,13 @@
                                    y_filter_0_7, y_filter_8_11);
   } else {
     DECLARE_ALIGNED(16, int16_t,
-                    im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
 
-    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+    convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
+                              x_filter_ptr);
+
     const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
 
-    convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride, w,
-                                   im_h, x_filter);
-
     if (clamped_y_taps <= 6) {
       convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                     y_filter);
@@ -3942,291 +1408,259 @@
   }
 }
 
-static INLINE void scaledconvolve_horiz_w4(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-    const int x0_q4, const int x_step_q4, const int w, const int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
+void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_x,
+                                    const int subpel_x_qn,
+                                    ConvolveParams *conv_params) {
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)conv_params;
 
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  y = h;
-  do {
-    int x_q4 = x0_q4;
-    x = 0;
+  if (w <= 4) {
     do {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        if (x_q4 & SUBPEL_MASK) {
-          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          uint8x8_t s[8], d;
-          int16x8_t ss[4];
-          int16x4_t t[8], tt;
+      uint8x8_t s0_0 = vld1_u8(src);
+      uint8x8_t s0_1 = vld1_u8(src + 1);
+      uint8x8_t s1_0 = vld1_u8(src + src_stride);
+      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
 
-          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
-          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
 
-          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
-          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
-          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
-          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
-          t[0] = vget_low_s16(ss[0]);
-          t[1] = vget_low_s16(ss[1]);
-          t[2] = vget_low_s16(ss[2]);
-          t[3] = vget_low_s16(ss[3]);
-          t[4] = vget_high_s16(ss[0]);
-          t[5] = vget_high_s16(ss[1]);
-          t[6] = vget_high_s16(ss[2]);
-          t[7] = vget_high_s16(ss[3]);
-
-          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
-                           filters);
-          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
-          store_u8_4x1(&temp[4 * z], d, 0);
-        } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
+      if (w == 2) {
+        store_u8_2x1(dst + 0 * dst_stride, d0, 0);
+        store_u8_2x1(dst + 1 * dst_stride, d1, 0);
+      } else {
+        store_u8_4x1(dst + 0 * dst_stride, d0, 0);
+        store_u8_4x1(dst + 1 * dst_stride, d1, 0);
       }
 
-      // transpose the 4x4 filtered values back to dst
-      {
-        const uint8x8x4_t d4 = vld4_u8(temp);
-        store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0);
-        store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0);
-        store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0);
-        store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0);
-      }
-      x += 4;
-    } while (x < w);
-
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    y -= 4;
-  } while (y > 0);
-}
-
-static INLINE void scaledconvolve_horiz_w8(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-    const int x0_q4, const int x_step_q4, const int w, const int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  // This function processes 8x8 areas. The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = (h + 7) & ~7;
-
-  do {
-    int x_q4 = x0_q4;
-    x = 0;
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {
     do {
-      uint8x8_t d[8];
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      uint8x8_t s0_0 = vld1_u8(src);
+      uint8x8_t s0_1 = vld1_u8(src + 1);
+      uint8x8_t s1_0 = vld1_u8(src + src_stride);
+      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
 
-        if (x_q4 & SUBPEL_MASK) {
-          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          uint8x8_t s[8];
-          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
-                      &s[5], &s[6], &s[7]);
-          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                           &s[7]);
-          d[0] = scale_filter_8(s, filters);
-          vst1_u8(&temp[8 * z], d[0]);
-        } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
+      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
 
-      // transpose the 8x8 filtered values back to dst
-      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
-                  &d[7]);
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
-      store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5],
-                   d[6], d[7]);
-      x += 8;
-    } while (x < w);
+      vst1_u8(dst, d0);
+      vst1_u8(dst + dst_stride, d1);
 
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
-
-static INLINE void scaledconvolve_vert_w4(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
-  do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-
-    if (y_q4 & SUBPEL_MASK) {
-      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      uint8x8_t s[8], d;
-      int16x4_t t[8], tt;
-
-      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                  &s[6], &s[7]);
-      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
-      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
-      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
-      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
-      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
-      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
-      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
-      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
-      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
-      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
-      store_u8_4x1(dst, d, 0);
-    } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
-    }
-
-    dst += dst_stride;
-    y_q4 += y_step_q4;
-  } while (--y);
-}
-
-static INLINE void scaledconvolve_vert_w8(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
-  do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    if (y_q4 & SUBPEL_MASK) {
-      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      uint8x8_t s[8], d;
-      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                  &s[6], &s[7]);
-      d = scale_filter_8(s, filters);
-      vst1_u8(dst, d);
-    } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
-    }
-    dst += dst_stride;
-    y_q4 += y_step_q4;
-  } while (--y);
-}
-
-static INLINE void scaledconvolve_vert_w16(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int x, y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
-  do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    if (y_q4 & SUBPEL_MASK) {
-      x = 0;
-      do {
-        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-        uint8x16_t ss[8];
-        uint8x8_t s[8], d[2];
-        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
-                     &ss[5], &ss[6], &ss[7]);
-        s[0] = vget_low_u8(ss[0]);
-        s[1] = vget_low_u8(ss[1]);
-        s[2] = vget_low_u8(ss[2]);
-        s[3] = vget_low_u8(ss[3]);
-        s[4] = vget_low_u8(ss[4]);
-        s[5] = vget_low_u8(ss[5]);
-        s[6] = vget_low_u8(ss[6]);
-        s[7] = vget_low_u8(ss[7]);
-        d[0] = scale_filter_8(s, filters);
-
-        s[0] = vget_high_u8(ss[0]);
-        s[1] = vget_high_u8(ss[1]);
-        s[2] = vget_high_u8(ss[2]);
-        s[3] = vget_high_u8(ss[3]);
-        s[4] = vget_high_u8(ss[4]);
-        s[5] = vget_high_u8(ss[5]);
-        s[6] = vget_high_u8(ss[6]);
-        s[7] = vget_high_u8(ss[7]);
-        d[1] = scale_filter_8(s, filters);
-        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
-        src_y += 16;
-        x += 16;
-      } while (x < w);
-    } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
-    }
-    dst += dst_stride;
-    y_q4 += y_step_q4;
-  } while (--y);
-}
-
-void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const InterpKernel *filter,
-                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
-                        int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  // When calling in frame scaling function, the smallest scaling factor is x1/4
-  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
-  // big enough.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
-  assert(x_step_q4 <= 64);
-
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
-                            intermediate_height);
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
   } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
-                            intermediate_height);
+    do {
+      const uint8_t *src_ptr = src;
+      uint8_t *dst_ptr = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(src_ptr);
+        uint8x16_t s1 = vld1q_u8(src_ptr + 1);
+
+        uint8x16_t d0 = vrhaddq_u8(s0, s1);
+
+        vst1q_u8(dst_ptr, d0);
+
+        src_ptr += 16;
+        dst_ptr += 16;
+        width -= 16;
+      } while (width != 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  }
+}
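The horizontal intrabc path above implements the 2-tap half-pel filter as a rounding average of each pixel with its right neighbour (vrhadd_u8 / vrhaddq_u8). A minimal scalar sketch of the same computation, assuming 8-bit samples (the helper name is illustrative):

static void convolve_x_intrabc_halfpel_scalar(const uint8_t *src, int src_stride,
                                              uint8_t *dst, int dst_stride, int w,
                                              int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Rounding average of two horizontally adjacent pixels, as vrhadd_u8 does.
      dst[y * dst_stride + x] = (uint8_t)(
          (src[y * src_stride + x] + src[y * src_stride + x + 1] + 1) >> 1);
    }
  }
}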
+
+void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_y,
+                                    const int subpel_y_qn) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+
+  if (w <= 4) {
+    do {
+      uint8x8_t s0 = load_unaligned_u8_4x1(src);
+      uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
+      uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
+
+      uint8x8_t d0 = vrhadd_u8(s0, s1);
+      uint8x8_t d1 = vrhadd_u8(s1, s2);
+
+      if (w == 2) {
+        store_u8_2x1(dst + 0 * dst_stride, d0, 0);
+        store_u8_2x1(dst + 1 * dst_stride, d1, 0);
+      } else {
+        store_u8_4x1(dst + 0 * dst_stride, d0, 0);
+        store_u8_4x1(dst + 1 * dst_stride, d1, 0);
+      }
+
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {
+    do {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + src_stride);
+      uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
+
+      uint8x8_t d0 = vrhadd_u8(s0, s1);
+      uint8x8_t d1 = vrhadd_u8(s1, s2);
+
+      vst1_u8(dst, d0);
+      vst1_u8(dst + dst_stride, d1);
+
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *src_ptr = src;
+      uint8_t *dst_ptr = dst;
+      int height = h;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(src_ptr);
+        uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
+
+        uint8x16_t d0 = vrhaddq_u8(s0, s1);
+
+        vst1q_u8(dst_ptr, d0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--height != 0);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    } while (w != 0);
+  }
+}
+
+void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     const InterpFilterParams *filter_params_x,
+                                     const InterpFilterParams *filter_params_y,
+                                     const int subpel_x_qn,
+                                     const int subpel_y_qn,
+                                     ConvolveParams *conv_params) {
+  assert(subpel_x_qn == 8);
+  assert(subpel_y_qn == 8);
+  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)conv_params;
+
+  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + 1;
+  int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+  uint16_t *im = im_block;
+
+  // Horizontal filter.
+  if (w <= 4) {
+    do {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+
+      uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
+
+      // Safe to store the whole vector, as the im buffer is big enough.
+      vst1_u16(im, sum);
+
+      src += src_stride;
+      im += im_stride;
+    } while (--im_h != 0);
+  } else {
+    do {
+      const uint8_t *src_ptr = src;
+      uint16_t *im_ptr = im;
+      int width = w;
+
+      do {
+        uint8x8_t s0 = vld1_u8(src_ptr);
+        uint8x8_t s1 = vld1_u8(src_ptr + 1);
+
+        uint16x8_t sum = vaddl_u8(s0, s1);
+
+        vst1q_u16(im_ptr, sum);
+
+        src_ptr += 8;
+        im_ptr += 8;
+        width -= 8;
+      } while (width != 0);
+      src += src_stride;
+      im += im_stride;
+    } while (--im_h != 0);
   }
 
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, filter, y0_q4, y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  im = im_block;
+
+  // Vertical filter.
+  if (w <= 4) {
+    do {
+      uint16x4_t s0 = vld1_u16(im);
+      uint16x4_t s1 = vld1_u16(im + im_stride);
+      uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
+
+      uint16x4_t sum0 = vadd_u16(s0, s1);
+      uint16x4_t sum1 = vadd_u16(s1, s2);
+
+      uint8x8_t d01 = vqrshrn_n_u16(vcombine_u16(sum0, sum1), 2);
+
+      if (w == 2) {
+        store_u8_2x1(dst + 0 * dst_stride, d01, 0);
+        store_u8_2x1(dst + 1 * dst_stride, d01, 2);
+      } else {
+        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      }
+
+      im += 2 * im_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
   } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+    do {
+      uint16_t *im_ptr = im;
+      uint8_t *dst_ptr = dst;
+      int height = h;
+
+      do {
+        uint16x8_t s0 = vld1q_u16(im_ptr);
+        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
+
+        uint16x8_t sum = vaddq_u16(s0, s1);
+        uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
+
+        vst1_u8(dst_ptr, d0);
+
+        im_ptr += im_stride;
+        dst_ptr += dst_stride;
+      } while (--height != 0);
+      im += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
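The 2D intrabc path above keeps the horizontal 2-tap sums at full precision in im_block and applies a single rounding shift in the vertical pass (vqrshrn_n_u16(sum, 2)). A scalar sketch of one output pixel under the same assumptions (8-bit samples; the helper name is illustrative):

static uint8_t convolve_2d_intrabc_halfpel_scalar(const uint8_t *src,
                                                  int src_stride, int x, int y) {
  // Horizontal pass: unrounded 2-tap sums, as stored in im_block.
  const unsigned h0 = src[y * src_stride + x] + src[y * src_stride + x + 1];
  const unsigned h1 =
      src[(y + 1) * src_stride + x] + src[(y + 1) * src_stride + x + 1];
  // Vertical pass: 2-tap sum followed by one rounding shift, matching
  // vqrshrn_n_u16(sum, 2) above.
  return (uint8_t)((h0 + h1 + 2) >> 2);
}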
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 14a6ebe..6b8edf8 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -15,549 +15,530 @@
 
 #include "config/aom_config.h"
 
-#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
 
-static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
-                                    const int16x4_t s2, const int16x4_t s3,
-                                    const int16x4_t s4, const int16x4_t s5,
-                                    const int16x4_t s6, const int16x4_t s7,
-                                    const int16x8_t filter) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x4_t sum;
-
-  sum = vmul_lane_s16(s0, filter_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
-  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
-  return sum;
-}
-
-static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
-                                    const int16x8_t s2, const int16x8_t s3,
-                                    const int16x8_t s4, const int16x8_t s5,
-                                    const int16x8_t s6, const int16x8_t s7,
-                                    const int16x8_t filter) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x8_t sum;
-
-  sum = vmulq_lane_s16(s0, filter_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
-  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
-  return vqrshrun_n_s16(sum, 7);
-}
-
-static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
-                                       const int16x8_t filter) {
-  int16x8_t ss[8];
-
-  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
-  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
-  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
-  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
-  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
-  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
-  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
-  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
-
-  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
-                     filter);
-}
-
-static INLINE uint8x8_t wiener_convolve8_vert_4x8(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, int16_t *filter_y, const int bd,
-    const int round1_bits) {
-  int16x8_t ss0, ss1, ss2;
-  int32x4_t sum0, sum1;
-  int16x8_t tmp;
-  uint8x8_t res;
-
-  const int32_t round_const = (1 << (bd + round1_bits - 1));
-  const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
-  const int32x4_t round_vec = vdupq_n_s32(round_const);
-  const int16x4_t filter = vld1_s16(filter_y);
-
-  ss0 = vaddq_s16(s0, s6);
-  ss1 = vaddq_s16(s1, s5);
-  ss2 = vaddq_s16(s2, s4);
-
-  sum0 = vmull_lane_s16(vget_low_s16(ss0), filter, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss1), filter, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss2), filter, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
-
-  sum1 = vmull_lane_s16(vget_high_s16(ss0), filter, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss1), filter, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss2), filter, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
-
-  sum0 = vsubq_s32(sum0, round_vec);
-  sum1 = vsubq_s32(sum1, round_vec);
-
-  /* right shift & rounding */
-  sum0 = vrshlq_s32(sum0, round_bits);
-  sum1 = vrshlq_s32(sum1, round_bits);
-
-  /* from int32x4_t to uint8x8_t */
-  tmp = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  res = vqmovun_s16(tmp);
-
-  return res;
-}
-
-static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, int16_t *filter_x, const int bd,
-    const int round0_bits) {
-  int16x8_t sum;
-  uint16x8_t res;
-  int32x4_t sum_0, sum_1;
-  int32x4_t s3_0, s3_1;
-  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
-  const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
-
-  /* for the purpose of right shift by { conv_params->round_0 } */
-  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
-
-  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
-  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
-  const int16x4_t filter = vld1_s16(filter_x);
-
-  sum = vmulq_lane_s16(s0, filter, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter, 2);
-
-  /* sum from 16x8 to 2 32x4 registers */
-  sum_0 = vmovl_s16(vget_low_s16(sum));
-  sum_1 = vmovl_s16(vget_high_s16(sum));
-
-  /* s[3]*128 -- and the filter coefficient max can be 128,
-   * so the maximum possible value is 128*128*255, which exceeds 16 bits.
-   */
-
-  s3_0 = vmull_lane_s16(vget_low_s16(s3), filter, 3);
-  s3_1 = vmull_lane_s16(vget_high_s16(s3), filter, 3);
-  sum_0 = vaddq_s32(sum_0, s3_0);
-  sum_1 = vaddq_s32(sum_1, s3_1);
-
-  /* Add the constant value */
-  sum_0 = vaddq_s32(sum_0, round_vec_0);
-  sum_1 = vaddq_s32(sum_1, round_vec_0);
-
-  /* right shift & rounding & saturating */
-  sum_0 = vqrshlq_s32(sum_0, round_bits);
-  sum_1 = vqrshlq_s32(sum_1, round_bits);
-
-  /* Clipping to max value */
-  sum_0 = vminq_s32(sum_0, round_vec_1);
-  sum_1 = vminq_s32(sum_1, round_vec_1);
-
-  res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1));
-  return res;
-}
-
-static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, int16_t *filter_x, const int bd,
-    const int round0_bits) {
-  uint16x4_t res;
-  int32x4_t sum_0, s3_0;
-  int16x4_t sum, temp0, temp1, temp2;
-
-  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
-  const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
-  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
-  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
-  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
-  const int16x4_t filter = vld1_s16(filter_x);
-
-  temp0 = vadd_s16(s0, s6);
-  temp1 = vadd_s16(s1, s5);
-  temp2 = vadd_s16(s2, s4);
-
-  sum = vmul_lane_s16(temp0, filter, 0);
-  sum = vmla_lane_s16(sum, temp1, filter, 1);
-  sum = vmla_lane_s16(sum, temp2, filter, 2);
-  sum_0 = vmovl_s16(sum);
-
-  /* s[3]*128 -- and the filter coefficient max can be 128,
-   * so the maximum possible value is 128*128*255. Therefore, 32 bits are
-   * required to hold the result.
-   */
-  s3_0 = vmull_lane_s16(s3, filter, 3);
-  sum_0 = vaddq_s32(sum_0, s3_0);
-
-  sum_0 = vaddq_s32(sum_0, round_vec_0);
-  sum_0 = vrshlq_s32(sum_0, round_bits);
-
-  sum_0 = vminq_s32(sum_0, round_vec_1);
-  res = vqmovun_s32(sum_0);
-  return res;
-}
-
-static INLINE int16x8_t convolve8_8x8_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
-    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x8_t sum;
-
-  sum = horiz_const;
-  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
-
-  sum = vqrshlq_s16(sum, shift_round_0);
-
-  return sum;
-}
-
-// clang versions < 16 did not include the dotprod feature for Arm architecture
-// versions that should have it by default, e.g., armv8.6-a.
-#if AOM_ARCH_AARCH64 && \
-    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
-  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
-  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
-  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-#endif  // AOM_ARCH_AARCH64 && (defined(__ARM_FEATURE_DOTPROD) ||
-        // defined(__ARM_FEATURE_MATMUL_INT8))
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x8_t convolve8_x_8_usdot(uint8x16_t samples,
-                                            const int8x8_t filters,
-                                            const uint8x16x3_t permute_tbl,
-                                            const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum[2];
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
-  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
-  return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
-}
-
-static INLINE int16x8_t convolve8_horiz_8_usdot(uint8x16_t samples,
-                                                const int8x8_t filters,
-                                                const uint8x16x3_t permute_tbl,
-                                                const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum[2];
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
-  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  // We halved the convolution filter values so -1 from the right shift.
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
-                                          const int8x8_t filters,
-                                          const uint8x16x2_t permute_tbl,
-                                          const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  /* First 4 output values. */
-  sum = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x8_t convolve8_horiz_8_sdot(uint8x16_t samples,
-                                               const int8x8_t filters,
-                                               const int32x4_t correction,
-                                               const uint8x16_t range_limit,
-                                               const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum[2];
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  /* We halved the convolution filter values so -1 from the right shift. */
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
-}
-
-static INLINE int16x8_t convolve8_8_sdot(uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x3_t permute_tbl,
-                                         const int16x8_t shift_round_0) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  return vqrshlq_s16(sum, shift_round_0);
-}
-
-static INLINE int16x8_t convolve8_x_8_sdot(uint8x16_t samples,
-                                           const int8x8_t filters,
-                                           const int32x4_t correction,
-                                           const uint8x16_t range_limit,
-                                           const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum[2];
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  return vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x4_t convolve8_4x4_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
-    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x4_t sum;
-
-  sum = horiz_const;
-  sum = vmla_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
-
-  sum = vqrshl_s16(sum, shift_round_0);
-
-  return sum;
-}
-
-static INLINE int16x4_t convolve6_4x4(const int16x4_t s0, const int16x4_t s1,
-                                      const int16x4_t s2, const int16x4_t s3,
-                                      const int16x4_t s4, const int16x4_t s5,
-                                      const int16x8_t y_filter_0_7) {
+static INLINE int32x4_t
+convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                  const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                  const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+                  const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+                  const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
   const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
   const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-  int16x4_t sum;
 
-  // Filter values at indices 0 and 7 are 0.
-  sum = vmul_lane_s16(s0, y_filter_0_3, 1);
-  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
-  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
-  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
-  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
-  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
 
   return sum;
 }
 
-static INLINE int16x8_t convolve6_8x4(const int16x8_t s0, const int16x8_t s1,
-                                      const int16x8_t s2, const int16x8_t s3,
-                                      const int16x8_t s4, const int16x8_t s5,
-                                      const int16x8_t y_filters) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filters);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filters);
-  int16x8_t sum;
+static INLINE uint8x8_t
+convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                  const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                  const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+                  const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+                  const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+                  const int16x8_t sub_const) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
 
-  // Filter values at indices 0 and 7 are 0.
-  sum = vmulq_lane_s16(s0, y_filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
-  sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
-  sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
 
-  return sum;
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+  int16x8_t res =
+      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+  res = vsubq_s16(res, sub_const);
+
+  return vqmovun_s16(res);
 }
 
-#if !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
+static INLINE void convolve_2d_sr_vert_12tap_neon(
+    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+    int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+  const int bd = 8;
+  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
 
-static INLINE int16x4_t convolve8_horiz_4x4_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
-    const int16x4_t horiz_const) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x4_t sum;
+  if (w <= 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+                  &s8, &s9, &s10);
+    src_ptr += 11 * src_stride;
 
-  sum = horiz_const;
-  sum = vmla_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+    do {
+      int16x4_t s11, s12, s13, s14;
+      load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
 
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshr_n_s16(sum, ROUND0_BITS - 1);
+      int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+                                       s10, s11, y_filter_0_7, y_filter_8_11);
+      int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                       s11, s12, y_filter_0_7, y_filter_8_11);
+      int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                                       s12, s13, y_filter_0_7, y_filter_8_11);
+      int32x4_t d3 =
+          convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+                            y_filter_0_7, y_filter_8_11);
+
+      int16x8_t dd01 =
+          vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
+                       vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
+      int16x8_t dd23 =
+          vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
+                       vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
+
+      dd01 = vsubq_s16(dd01, sub_const);
+      dd23 = vsubq_s16(dd23, sub_const);
+
+      uint8x8_t d01 = vqmovun_s16(dd01);
+      uint8x8_t d23 = vqmovun_s16(dd23);
+
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s7 = s11;
+      s8 = s12;
+      s9 = s13;
+      s10 = s14;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+  } else {
+    do {
+      int height = h;
+      int16_t *s = src_ptr;
+      uint8_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                    &s9, &s10);
+      s += 11 * src_stride;
+
+      do {
+        int16x8_t s11, s12, s13, s14;
+        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+        uint8x8_t d0 =
+            convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                              y_filter_0_7, y_filter_8_11, sub_const);
+        uint8x8_t d1 =
+            convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                              y_filter_0_7, y_filter_8_11, sub_const);
+        uint8x8_t d2 =
+            convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                              s13, y_filter_0_7, y_filter_8_11, sub_const);
+        uint8x8_t d3 =
+            convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                              s14, y_filter_0_7, y_filter_8_11, sub_const);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s7 = s11;
+        s8 = s12;
+        s9 = s13;
+        s10 = s14;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
 }
 
-static INLINE int16x8_t convolve8_horiz_8x8_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
-    const int16x8_t horiz_const) {
-  const int16x4_t filter_lo = vget_low_s16(filter);
-  const int16x4_t filter_hi = vget_high_s16(filter);
-  int16x8_t sum;
+static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t s2, const int16x4_t s3,
+                                         const int16x4_t s4, const int16x4_t s5,
+                                         const int16x4_t s6, const int16x4_t s7,
+                                         const int16x8_t y_filter) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
 
-  sum = horiz_const;
-  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
-  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
-  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
-  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
 
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshrq_n_s16(sum, ROUND0_BITS - 1);
+  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
 }
 
-#endif  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
+static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         const int16x8_t y_filter,
+                                         const int16x8_t sub_const) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+
+  int16x8_t res =
+      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+  res = vsubq_s16(res, sub_const);
+
+  return vqmovun_s16(res);
+}
+
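The 2D vertical helpers above accumulate the 8-tap filter in 32 bits, apply a rounding shift of 2 * FILTER_BITS - ROUND0_BITS, subtract sub_const = 1 << (bd - 1), and saturate to 8 bits. A scalar sketch of one output pixel, assuming the usual FILTER_BITS == 7 and ROUND0_BITS == 3 (the helper name is illustrative and the 16-bit intermediate saturation is omitted):

static uint8_t convolve8_2d_v_scalar(const int16_t *im, int im_stride,
                                     const int16_t *y_filter, int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)y_filter[k] * im[k * im_stride];
  // Rounding shift, as in vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS).
  const int shift = 2 * 7 - 3;  // 2 * FILTER_BITS - ROUND0_BITS (assumed values)
  int32_t res = (sum + (1 << (shift - 1))) >> shift;
  // Subtract the sub_const offset and saturate, as in the
  // vsubq_s16 / vqmovun_s16 pair above.
  res -= 1 << (bd - 1);
  if (res < 0) res = 0;
  if (res > 255) res = 255;
  return (uint8_t)res;
}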
+static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
+                                                 int src_stride,
+                                                 uint8_t *dst_ptr,
+                                                 int dst_stride, int w, int h,
+                                                 const int16x8_t y_filter) {
+  const int bd = 8;
+  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+  if (w <= 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src_ptr += 7 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      int16x4_t d3 =
+          convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s7 = vld1_s16(src_ptr);
+      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      uint8x8_t d01 =
+          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+
+      store_u8_4x1(dst_ptr, d01, 0);
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    // Width is a multiple of 8 and height is a multiple of 4.
+    do {
+      int height = h;
+      int16_t *s = src_ptr;
+      uint8_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, sub_const);
+        uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                        y_filter, sub_const);
+        uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                        y_filter, sub_const);
+        uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                        y_filter, sub_const);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vld1q_s16(s);
+        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, sub_const);
+        vst1_u8(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t s2, const int16x4_t s3,
+                                         const int16x4_t s4, const int16x4_t s5,
+                                         const int16x8_t y_filter) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
+
+  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t y_filter,
+                                         const int16x8_t sub_const) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
+
+  int16x8_t res =
+      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+  res = vsubq_s16(res, sub_const);
+
+  return vqmovun_s16(res);
+}
+
+static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
+                                                 int src_stride,
+                                                 uint8_t *dst_ptr,
+                                                 int dst_stride, int w, int h,
+                                                 const int16x8_t y_filter) {
+  const int bd = 8;
+  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+
+  if (w <= 4) {
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+    src_ptr += 5 * src_stride;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+      int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter);
+      int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter);
+      int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter);
+
+      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+      store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+#else   // !AOM_ARCH_AARCH64
+      int16x4_t s5 = vld1_s16(src_ptr);
+      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+      uint8x8_t d01 =
+          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+
+      store_u8_4x1(dst_ptr, d01, 0);
+
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      h--;
+#endif  // AOM_ARCH_AARCH64
+    } while (h != 0);
+  } else {
+    // Width is a multiple of 8 and height is a multiple of 4.
+    do {
+      int height = h;
+      int16_t *s = src_ptr;
+      uint8_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint8x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+        uint8x8_t d1 =
+            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
+        uint8x8_t d2 =
+            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
+        uint8x8_t d3 =
+            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s5 = vld1q_s16(s);
+        uint8x8_t d0 =
+            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+        vst1_u8(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
 
 #endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
new file mode 100644
index 0000000..ba8f7e7
--- /dev/null
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
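+
+// Each 16-byte row of the table above gathers four overlapping 4-sample
+// windows, so a single TBL lookup yields the inputs for four adjacent output
+// pixels of one 4-tap slice of the dot product. For example, row 0 maps
+//   { x0, x1, x2, x3, x4, x5, x6, ... }
+// to
+//   { x0, x1, x2, x3 | x1, x2, x3, x4 | x2, x3, x4, x5 | x3, x4, x5, x6 }.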
+
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+                                       const int8x16_t filter,
+                                       const int32x4_t correction,
+                                       const uint8x16_t range_limit,
+                                       const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum;
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+  sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1);
+  sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2);
+
+  return vqrshrn_n_s32(sum, FILTER_BITS);
+}
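+
+// Scalar sketch of the clamp-and-correct scheme used above (illustrative
+// only, not part of the optimized path; 'src', 'filter', 'acc' are
+// placeholder names):
+//
+//   int32_t acc = correction;  // caller passes 128 * sum(filter) + shim
+//   for (int k = 0; k < 12; k++) acc += (src[k] - 128) * filter[k];
+//   // => acc == sum(src[k] * filter[k]) + shim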
+
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+                                       const int8x16_t filter,
+                                       const int32x4_t correction,
+                                       const uint8x16_t range_limit,
+                                       const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples[2], permuted_samples[4];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+  clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1);
+  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2);
+  // Second 4 output values.
+  sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0);
+  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1);
+  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2);
+
+  // Narrow and re-pack.
+  int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
+                                   vqrshrn_n_s32(sum[1], FILTER_BITS));
+  return vqmovun_s16(sum_s16);
+}
+
+static INLINE void convolve_x_sr_12tap_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const int16_t *x_filter_ptr) {
+  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
+  const int8x16_t filter =
+      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+
+  const int32_t correction_s32 =
+      vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)),
+                           vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS))));
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+  // shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by a second rounding right shift by
+  // FILTER_BITS - ROUND0_BITS.
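+  // The two are identical because, with R = ROUND0_BITS and F = FILTER_BITS,
+  //   (((x + (1 << (R - 1))) >> R) + (1 << (F - R - 1))) >> (F - R)
+  //     == (x + (1 << (R - 1)) + (1 << (F - 1))) >> F
+  // which is exactly what the shimmed single rounding shift computes.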
+  int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+  // Special case the following no-op filter as 128 won't fit into the
+  // 8-bit signed dot-product instruction:
+  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
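+  // (The identity tap equals 1 << FILTER_BITS == 128, one more than INT8_MAX,
+  // so this filter reduces to a copy of the center sample and is handled with
+  // plain loads and stores below.)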
+  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+    // Undo the horizontal offset in the calling function.
+    src += 5;
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x8_t d0 = vld1_u8(s);
+        if (w == 4) {
+          store_u8_4x1(d, d0, 0);
+        } else {
+          vst1_u8(d, d0);
+        }
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {
+    if (w <= 4) {
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x4_t d0 =
+            convolve12_4_x(s0, filter, correction, range_limit, permute_tbl);
+        int16x4_t d1 =
+            convolve12_4_x(s1, filter, correction, range_limit, permute_tbl);
+        int16x4_t d2 =
+            convolve12_4_x(s2, filter, correction, range_limit, permute_tbl);
+        int16x4_t d3 =
+            convolve12_4_x(s3, filter, correction, range_limit, permute_tbl);
+
+        uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+        uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+        store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+        store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+        dst += 4 * dst_stride;
+        src += 4 * src_stride;
+        h -= 4;
+      } while (h != 0);
+    } else {
+      do {
+        const uint8_t *s = src;
+        uint8_t *d = dst;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2], s1[2], s2[2], s3[2];
+          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+          uint8x8_t d0 =
+              convolve12_8_x(s0, filter, correction, range_limit, permute_tbl);
+          uint8x8_t d1 =
+              convolve12_8_x(s1, filter, correction, range_limit, permute_tbl);
+          uint8x8_t d2 =
+              convolve12_8_x(s2, filter, correction, range_limit, permute_tbl);
+          uint8x8_t d3 =
+              convolve12_8_x(s3, filter, correction, range_limit, permute_tbl);
+
+          store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+        src += 4 * src_stride;
+        dst += 4 * dst_stride;
+        h -= 4;
+      } while (h != 0);
+    }
+  }
+}
+
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+                                      const int32x4_t correction,
+                                      const uint8x16_t range_limit,
+                                      const uint8x16_t permute_tbl) {
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t clamped_samples =
+      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0);
+
+  // Packing is performed by the caller.
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+                                      const int32x4_t correction,
+                                      const uint8x16_t range_limit,
+                                      const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1);
+  // Second 4 output values.
+  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+  // We halved the convolution filter values so - 1 from the right shift.
+  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
+}
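+
+// Note on the halved filters used above: the 8-tap sub-pixel filter values
+// are all even, so the caller passes filter[k] / 2 (still exact in int8) and
+// compensates by shifting right by FILTER_BITS - 1 instead of FILTER_BITS.
+// Since the halved accumulation is exact, the result is unchanged.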
+
+void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_x,
+                                    const int subpel_x_qn,
+                                    ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    return;
+  }
+
+  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+  src -= horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
+                                     x_filter_ptr);
+    return;
+  }
+
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+  // Dot product constants.
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+  // rounding right shift by FILTER_BITS - instead of a first rounding right
+  // shift by ROUND0_BITS, followed by a second rounding right shift by
+  // FILTER_BITS - ROUND0_BITS.
+  // The outermost -1 is needed because we will halve the filter values.
+  const int32x4_t correction =
+      vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
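+    // The 4-tap kernel occupies positions 2..5 of the zero-padded 8-tap array
+    // (hence x_filter_ptr + 2 above), so step the source forward by 2 to
+    // cancel the surplus part of horiz_offset applied earlier.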
+    src += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 =
+          convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d1 =
+          convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d2 =
+          convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d3 =
+          convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+      // We halved the convolution filter values so - 1 from the right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      int width = w;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 =
+            convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+        uint8x8_t d1 =
+            convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+        uint8x8_t d2 =
+            convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+        uint8x8_t d3 =
+            convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+                                          const int8x16_t filters,
+                                          const int32x4_t correction,
+                                          const uint8x16_t range_limit,
+                                          const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum;
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+  sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+  sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+  // Narrow and re-pack.
+  return vshrn_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+                                          const int8x16_t filters,
+                                          const int32x4_t correction,
+                                          const uint8x16_t range_limit,
+                                          const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples[2], permuted_samples[4];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+  clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+  sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+  // Second 4 output values.
+  sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
+  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+  sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+  // Narrow and re-pack.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
+                      vshrn_n_s32(sum[1], ROUND0_BITS));
+}
+
+static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+    const int16x4_t x_filter_8_11) {
+  const int bd = 8;
+
+  // Special case the following no-op filter as 128 won't fit into the 8-bit
+  // signed dot-product instruction:
+  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+    const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+    // Undo the horizontal offset in the calling function.
+    src_ptr += 5;
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x8_t s0 = vld1_u8(s);
+        uint16x8_t d0 = vaddw_u8(horiz_const, s0);
+        d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+        // Store 8 elements to avoid additional branches. This is safe if the
+        // actual block width is < 8 because the intermediate buffer is large
+        // enough to accommodate 128x128 blocks.
+        vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+        d += 8;
+        s += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+
+  } else {
+    // Narrow filter values to 8-bit.
+    const int16x8x2_t x_filter_s16 = {
+      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+    };
+    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+                                           vmovn_s16(x_filter_s16.val[1]));
+
+    // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+    // - which are generally faster than rounding shifts on modern CPUs.
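+    // The 1 << (bd + FILTER_BITS - 1) term is the usual horizontal-pass
+    // offset: it keeps the intermediate values non-negative and, after the
+    // vertical pass's final shift, reduces to the 1 << (bd - 1) sub_const
+    // subtracted there.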
+    const int32_t horiz_const =
+        ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+    // Dot product constants.
+    const int32x4_t correct_tmp =
+        vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
+                  vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
+    const int32x4_t correction =
+        vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
+    const uint8x16_t range_limit = vdupq_n_u8(128);
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    if (w <= 4) {
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit,
+                                         permute_tbl);
+
+        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * dst_stride;
+        h -= 4;
+      } while (h > 4);
+
+      do {
+        uint8x16_t s0 = vld1q_u8(src_ptr);
+        int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+                                         permute_tbl);
+        vst1_s16(dst_ptr, d0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--h != 0);
+
+    } else {
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2], s1[2], s2[2], s3[2];
+          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+          int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+                                           range_limit, permute_tbl);
+          int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction,
+                                           range_limit, permute_tbl);
+          int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction,
+                                           range_limit, permute_tbl);
+          int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction,
+                                           range_limit, permute_tbl);
+
+          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * dst_stride;
+        h -= 4;
+      } while (h > 4);
+
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2];
+          s0[0] = vld1q_u8(s);
+          s0[1] = vld1q_u8(s + 4);
+          int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+                                           range_limit, permute_tbl);
+          vst1q_s16(d, d0);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--h != 0);
+    }
+  }
+}
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16_t permute_tbl) {
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t clamped_samples =
+      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
+  return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  // Accumulate dot product into 'correction' to account for range clamp.
+  // First 4 output values.
+  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
+  // Second 4 output values.
+  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void convolve_2d_sr_horiz_neon_dotprod(
+    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+    int im_h, const int16_t *x_filter_ptr) {
+  const int bd = 8;
+  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // The outermost -1 is needed because we halved the filter values.
+  const int32_t horiz_const =
+      ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
+  // Dot product constants.
+  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+  const int32_t correction_s32 =
+      vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+  const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const);
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int height = im_h;
+
+  if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 =
+          convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d1 =
+          convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d2 =
+          convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d3 =
+          convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+      int16x4_t d0 =
+          convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+                                        permute_tbl);
+        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+                                        permute_tbl);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+                                        permute_tbl);
+        vst1q_s16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     const InterpFilterParams *filter_params_x,
+                                     const InterpFilterParams *filter_params_y,
+                                     const int subpel_x_qn,
+                                     const int subpel_y_qn,
+                                     ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
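+  // Vertical filters shorter than 6 taps are handled by the 6-tap path below.
+  // The intermediate block is clamped_y_taps - 1 rows taller than the output
+  // so that the vertical pass has full support for every output row.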
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+    convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block,
+                                            im_stride, w, im_h, x_filter_0_7,
+                                            x_filter_8_11);
+
+    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                   y_filter_0_7, y_filter_8_11);
+  } else {
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+    convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride,
+                                      w, im_h, x_filter_ptr);
+
+    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+    if (clamped_y_taps <= 6) {
+      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter);
+    } else {
+      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter);
+    }
+  }
+}
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
new file mode 100644
index 0000000..14140ca
--- /dev/null
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+                                       const int8x16_t filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum;
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0);
+  sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1);
+  sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2);
+
+  return vqrshrn_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+                                       const int8x16_t filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[4];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0);
+  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1);
+  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2);
+  // Second 4 output values.
+  sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0);
+  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1);
+  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2);
+
+  // Narrow and re-pack.
+  int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
+                                   vqrshrn_n_s32(sum[1], FILTER_BITS));
+  return vqmovun_s16(sum_s16);
+}
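+
+// Unlike the SDOT-based path in convolve_neon_dotprod.c, the USDOT
+// instruction used here multiplies unsigned samples by signed filter values
+// directly, so no range clamp of the samples and no 128 * sum(filter)
+// correction are needed; only the rounding shim (horiz_const) remains.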
+
+static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
+                                                 int src_stride, uint8_t *dst,
+                                                 int dst_stride, int w, int h,
+                                                 const int16_t *x_filter_ptr) {
+  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
+  const int8x16_t filter =
+      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+
+  // Special case the following no-op filter as 128 won't fit into the
+  // 8-bit signed dot-product instruction:
+  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+    // Undo the horizontal offset in the calling function.
+    src += 5;
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x8_t d0 = vld1_u8(s);
+        if (w == 4) {
+          store_u8_4x1(d, d0, 0);
+        } else {
+          vst1_u8(d, d0);
+        }
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+    // right shift by FILTER_BITS - instead of a first rounding right shift by
+    // ROUND0_BITS, followed by a second rounding right shift by
+    // FILTER_BITS - ROUND0_BITS.
+    const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
+
+    if (w <= 4) {
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
+        int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
+        int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
+        int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
+
+        uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+        uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+        store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+        store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+        store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+        store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+        dst += 4 * dst_stride;
+        src += 4 * src_stride;
+        h -= 4;
+      } while (h != 0);
+    } else {
+      do {
+        const uint8_t *s = src;
+        uint8_t *d = dst;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2], s1[2], s2[2], s3[2];
+          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+          uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
+          uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
+          uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
+          uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
+
+          store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+        src += 4 * src_stride;
+        dst += 4 * dst_stride;
+        h -= 4;
+      } while (h != 0);
+    }
+  }
+}
+
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+                                      const uint8x16_t permute_tbl,
+                                      const int32x4_t horiz_const) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  // First 4 output values.
+  int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0);
+
+  // Packing is performed by the caller.
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+                                      const uint8x16x3_t permute_tbl,
+                                      const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0);
+  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1);
+  // Second 4 output values.
+  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0);
+  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1);
+
+  int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+  // We halved the convolution filter values so - 1 from the right shift.
+  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
+}
+
+void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const int subpel_x_qn,
+                                 ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    return;
+  }
+
+  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+  src -= horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
+                                  x_filter_ptr);
+    return;
+  }
+
+  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+  // rounding right shift by FILTER_BITS - instead of a first rounding right
+  // shift by ROUND0_BITS, followed by a second rounding right shift by
+  // FILTER_BITS - ROUND0_BITS.
+  // The outermost -1 is needed because we will halve the filter values.
+  const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+
+  if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const);
+
+      // We halved the convolution filter values so - 1 from the right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01, 0);
+      store_u8_4x1(dst + 1 * dst_stride, d01, 1);
+      store_u8_4x1(dst + 2 * dst_stride, d23, 0);
+      store_u8_4x1(dst + 3 * dst_stride, d23, 1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+                                          const int8x16_t filters,
+                                          const uint8x16x3_t permute_tbl,
+                                          int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum;
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+  sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+  sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+  // Narrow and re-pack.
+  return vshrn_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+                                          const int8x16_t filters,
+                                          const uint8x16x3_t permute_tbl,
+                                          const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[4];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0);
+  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+  sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+  // Second 4 output values.
+  sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0);
+  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+  sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+  // Narrow and re-pack.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
+                      vshrn_n_s32(sum[1], ROUND0_BITS));
+}
+
+static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
+    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+    const int16x4_t x_filter_8_11) {
+  const int bd = 8;
+
+  // Special case the following no-op filter as 128 won't fit into the
+  // 8-bit signed dot-product instruction:
+  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+    const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+    // Undo the horizontal offset in the calling function.
+    src_ptr += 5;
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x8_t s0 = vld1_u8(s);
+        uint16x8_t d0 = vaddw_u8(horiz_const, s0);
+        d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+        // Store 8 elements to avoid additional branches. This is safe if the
+        // actual block width is < 8 because the intermediate buffer is large
+        // enough to accommodate 128x128 blocks.
+        vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+        d += 8;
+        s += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+
+  } else {
+    // Narrow filter values to 8-bit.
+    const int16x8x2_t x_filter_s16 = {
+      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+    };
+    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+                                           vmovn_s16(x_filter_s16.val[1]));
+    // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+    // - which are generally faster than rounding shifts on modern CPUs.
+    const int32x4_t horiz_const =
+        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    if (w <= 4) {
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x4_t d0 =
+            convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        int16x4_t d1 =
+            convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+        int16x4_t d2 =
+            convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+        int16x4_t d3 =
+            convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * dst_stride;
+        h -= 4;
+      } while (h > 4);
+
+      do {
+        uint8x16_t s0 = vld1q_u8(src_ptr);
+        int16x4_t d0 =
+            convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        vst1_s16(dst_ptr, d0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--h != 0);
+
+    } else {
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2], s1[2], s2[2], s3[2];
+          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+          int16x8_t d0 =
+              convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+          int16x8_t d1 =
+              convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+          int16x8_t d2 =
+              convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+          int16x8_t d3 =
+              convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * dst_stride;
+        h -= 4;
+      } while (h > 4);
+
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          uint8x16_t s0[2];
+          s0[0] = vld1q_u8(s);
+          s0[1] = vld1q_u8(s + 4);
+          int16x8_t d0 =
+              convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+          vst1q_s16(d, d0);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width != 0);
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--h != 0);
+    }
+  }
+}
+
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const uint8x16_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  // First 4 output values.
+  int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0);
+
+  // We halved the convolution filter values so -1 from the right shift.
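+  // Every (even) tap and the additive offset were divided by 2, so the
+  // accumulator holds half the full-precision sum; shifting right by
+  // ROUND0_BITS - 1 therefore matches the unhalved filter shifted by
+  // ROUND0_BITS.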
+  return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const uint8x16x3_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum[2];
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // First 4 output values.
+  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0);
+  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
+  // Second 4 output values.
+  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0);
+  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  // We halved the convolution filter values so -1 from the right shift.
+  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+
+static INLINE void convolve_2d_sr_horiz_neon_i8mm(
+    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+    int im_h, const int16_t *x_filter_ptr) {
+  const int bd = 8;
+  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  // The outermost -1 is needed because we halved the filter values.
+  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+                                            (1 << ((ROUND0_BITS - 1) - 1)));
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int height = im_h;
+
+  if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    src_ptr += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        vst1q_s16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
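+  // 2xN and Nx2 blocks are not handled by the SIMD paths below, so defer to
+  // the C implementation.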
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
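+  // Back src up to the first sample required by the centred horizontal and
+  // vertical filter windows.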
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+    convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
+                                         im_stride, w, im_h, x_filter_0_7,
+                                         x_filter_8_11);
+
+    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                   y_filter_0_7, y_filter_8_11);
+  } else {
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+    convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w,
+                                   im_h, x_filter_ptr);
+
+    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+    if (clamped_y_taps <= 6) {
+      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter);
+    } else {
+      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter);
+    }
+  }
+}
diff --git a/av1/common/arm/highbd_compound_convolve_neon.c b/av1/common/arm/highbd_compound_convolve_neon.c
new file mode 100644
index 0000000..dc3f876
--- /dev/null
+++ b/av1/common/arm/highbd_compound_convolve_neon.c
@@ -0,0 +1,2031 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
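+// Shift applied when reducing the compound (CONV_BUF_TYPE) intermediate back
+// to pixel precision. With the usual constants (FILTER_BITS == 7,
+// ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7) this evaluates to 4; the
+// 12-bit paths use ROUND_SHIFT - 2 because their round_0 is two bits larger.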
+#define ROUND_SHIFT (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)
+
+static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
+                                           int src_stride, uint16_t *dst_ptr,
+                                           int dst_stride, int w, int h,
+                                           ConvolveParams *conv_params,
+                                           const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint16x4_t offset_vec = vdup_n_u16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
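+  // Unweighted compound average: (src + ref) >> 1 via vhadd, subtract the
+  // compound offset, then round-shift back to pixel precision and clamp to
+  // the bit depth.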
+
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint16x4_t avg = vhadd_u16(src, ref);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint16x8_t avg = vhaddq_u16(s, r);
+        int32x4_t d0_lo =
+            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+        int32x4_t d0_hi =
+            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2),
+                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2));
+        d0 = vminq_u16(d0, max);
+        vst1q_u16(dst, d0);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+                                        uint16_t *dst_ptr, int dst_stride,
+                                        int w, int h,
+                                        ConvolveParams *conv_params,
+                                        const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint16x4_t offset_vec = vdup_n_u16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint16x4_t avg = vhadd_u16(src, ref);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint16x8_t avg = vhaddq_u16(s, r);
+        int32x4_t d0_lo =
+            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+        int32x4_t d0_hi =
+            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+
+        uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT),
+                                     vqrshrun_n_s32(d0_hi, ROUND_SHIFT));
+        d0 = vminq_u16(d0, max);
+        vst1q_u16(dst, d0);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_12_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint32x4_t offset_vec = vdupq_n_u32(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+  // Weighted averaging
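+  // The AV1 distance weights satisfy
+  // fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS, so shifting the
+  // weighted sum right by DIST_PRECISION_BITS yields a weighted average.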
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2),
+                                      vqrshrun_n_s32(d1, ROUND_SHIFT - 2));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst, d01);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const uint32x4_t offset_vec = vdupq_n_u32(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+  // Weighted averaging
+  if (w == 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+
+      uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      vst1_u16(dst_ptr, d0_u16);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+
+        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+
+        uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT),
+                                      vqrshrun_n_s32(d1, ROUND_SHIFT));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst, d01);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE uint16x4_t highbd_12_convolve6_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of the 6-tap filter are zero.
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+static INLINE uint16x4_t
+highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x8_t filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of the 6-tap filter are zero.
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE uint16x8_t highbd_12_convolve6_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of the 6-tap filter are zero.
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
+                      vqshrun_n_s32(sum1, ROUND0_BITS + 2));
+}
+
+static INLINE uint16x8_t
+highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of the 6-tap filter are zero.
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
+                      vqshrun_n_s32(sum1, ROUND0_BITS));
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
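+  // The callers only take this path when w != 4, so the inner loop below
+  // assumes the width is a multiple of 8.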
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  int height = h;
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
+
+      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                            s0[5], x_filter, offset_vec);
+      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                            s1[5], x_filter, offset_vec);
+      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                            s2[5], x_filter, offset_vec);
+      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                            s3[5], x_filter, offset_vec);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height != 0);
+}
+
+static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  int height = h;
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
+
+      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], x_filter, offset_vec);
+      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], x_filter, offset_vec);
+      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], x_filter, offset_vec);
+      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], x_filter, offset_vec);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height != 0);
+}
+
+static INLINE uint16x4_t highbd_12_convolve8_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x4_t s6, const int16x4_t s7,
+                   const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE uint16x8_t highbd_12_convolve8_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
+
+  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
+                      vqshrun_n_s32(sum1, ROUND0_BITS + 2));
+}
+
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t s6, const int16x8_t s7,
+                   const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
+
+  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
+                      vqshrun_n_s32(sum1, ROUND0_BITS));
+}
+
+static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
+                                                 const int16x4_t x_filter,
+                                                 const int32x4_t offset) {
+  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+                                              const int16x4_t x_filter,
+                                              const int32x4_t offset) {
+  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  return vqshrun_n_s32(sum, ROUND0_BITS);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_x_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 2);
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
+      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
+      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 =
+            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+                                  s0[6], s0[7], x_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
+                                  s1[6], s1[7], x_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
+                                  s2[6], s2[7], x_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
+                                  s3[6], s3[7], x_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_convolve_x_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 2);
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
+      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
+      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+                               s0[7], x_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
+                               s1[7], x_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
+                               s2[7], x_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
+                               s3[7], x_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+  }
+}
+
+void av1_highbd_dist_wtd_convolve_x_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  int dst16_stride = conv_params->dst_stride;
+  const int im_stride = MAX_SB_SIZE;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1));
+  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
+                              (1 << (bd + FILTER_BITS)) +
+                              (1 << (bd + FILTER_BITS - 1));
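+  // After the round_0 shift, the two large terms of offset_convolve equal
+  // offset_avg (assuming conv_params->round_1 == COMPOUND_ROUND1_BITS on the
+  // compound path), baking the compound offset into the filtered output; the
+  // averaging stage subtracts it again. The 1 << (round_0 - 1) term rounds
+  // the round_0 shift.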
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  src -= horiz_offset;
+
+  // horizontal filter
+  if (bd == 12) {
+    if (conv_params->do_average) {
+      if (x_filter_taps <= 6 && w != 4) {
+        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
+                                                im_stride, w, h, x_filter_ptr,
+                                                offset_convolve);
+      } else {
+        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride,
+                                           w, h, x_filter_ptr, offset_convolve);
+      }
+      if (conv_params->use_dist_wtd_comp_avg) {
+        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, conv_params, offset_avg, bd);
+      } else {
+        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                conv_params, offset_avg, bd);
+      }
+    } else {
+      if (x_filter_taps <= 6 && w != 4) {
+        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
+                                                dst16_stride, w, h,
+                                                x_filter_ptr, offset_convolve);
+      } else {
+        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride,
+                                           w, h, x_filter_ptr, offset_convolve);
+      }
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (x_filter_taps <= 6 && w != 4) {
+        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
+                                             im_stride, w, h, x_filter_ptr,
+                                             offset_convolve);
+      } else {
+        highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w,
+                                        h, x_filter_ptr, offset_convolve);
+      }
+      if (conv_params->use_dist_wtd_comp_avg) {
+        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+                                      h, conv_params, offset_avg, bd);
+      } else {
+        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                             conv_params, offset_avg, bd);
+      }
+    } else {
+      if (x_filter_taps <= 6 && w != 4) {
+        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
+                                             dst16_stride, w, h, x_filter_ptr,
+                                             offset_convolve);
+      } else {
+        highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w,
+                                        h, x_filter_ptr, offset_convolve);
+      }
+    }
+  }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+    s += 5 * src_stride;
+
+    do {
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+      uint16x4_t d1 =
+          highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+      uint16x4_t d2 =
+          highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+      uint16x4_t d3 =
+          highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 =
+            highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+    s += 5 * src_stride;
+
+    do {
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+      uint16x4_t d1 =
+          highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+      uint16x4_t d2 =
+          highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+      uint16x4_t d3 =
+          highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 =
+            highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+                                            y_filter, offset_vec);
+      uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+                                            y_filter, offset_vec);
+      uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+                                            y_filter, offset_vec);
+      uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+                                            y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+                                              y_filter, offset_vec);
+        uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+                                              y_filter, offset_vec);
+        uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+                                              y_filter, offset_vec);
+        uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+                                              y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+                                         y_filter, offset_vec);
+      uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+                                         y_filter, offset_vec);
+      uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+                                         y_filter, offset_vec);
+      uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+                                         y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+                                           y_filter, offset_vec);
+        uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+                                           y_filter, offset_vec);
+        uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+                                           y_filter, offset_vec);
+        uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+                                           y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void av1_highbd_dist_wtd_convolve_y_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  int dst16_stride = conv_params->dst_stride;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+                               (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
+                                (1 << (bd + FILTER_BITS)) +
+                                (1 << (bd + FILTER_BITS - 1));
+
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  src -= vert_offset * src_stride;
+
+  if (bd == 12) {
+    if (conv_params->do_average) {
+      if (y_filter_taps <= 6) {
+        highbd_12_dist_wtd_convolve_y_6tap_neon(
+            src + src_stride, src_stride, im_block, im_stride, w, h,
+            y_filter_ptr, round_offset_conv);
+      } else {
+        highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+                                                im_stride, w, h, y_filter_ptr,
+                                                round_offset_conv);
+      }
+      if (conv_params->use_dist_wtd_comp_avg) {
+        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, conv_params, round_offset_avg,
+                                         bd);
+      } else {
+        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                conv_params, round_offset_avg, bd);
+      }
+    } else {
+      if (y_filter_taps <= 6) {
+        highbd_12_dist_wtd_convolve_y_6tap_neon(
+            src + src_stride, src_stride, dst16, dst16_stride, w, h,
+            y_filter_ptr, round_offset_conv);
+      } else {
+        highbd_12_dist_wtd_convolve_y_8tap_neon(
+            src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+            round_offset_conv);
+      }
+    }
+  } else {
+    if (conv_params->do_average) {
+      if (y_filter_taps <= 6) {
+        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+                                             im_block, im_stride, w, h,
+                                             y_filter_ptr, round_offset_conv);
+      } else {
+        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+                                             im_stride, w, h, y_filter_ptr,
+                                             round_offset_conv);
+      }
+      if (conv_params->use_dist_wtd_comp_avg) {
+        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+                                      h, conv_params, round_offset_avg, bd);
+      } else {
+        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                             conv_params, round_offset_avg, bd);
+      }
+    } else {
+      if (y_filter_taps <= 6) {
+        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+                                             dst16, dst16_stride, w, h,
+                                             y_filter_ptr, round_offset_conv);
+      } else {
+        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
+                                             dst16_stride, w, h, y_filter_ptr,
+                                             round_offset_conv);
+      }
+    }
+  }
+}
+
+static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
+                                       uint16_t *dst_ptr, int dst_stride, int w,
+                                       int h, const int round_bits,
+                                       const int offset) {
+  if (w <= 4) {
+    const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
+    const uint16x4_t offset_u16 = vdup_n_u16(offset);
+
+    for (int y = 0; y < h; ++y) {
+      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
+      uint16x4_t d = vshl_u16(s, round_shift_s16);
+      d = vadd_u16(d, offset_u16);
+      if (w == 2) {
+        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+      } else {
+        vst1_u16(dst_ptr + y * dst_stride, d);
+      }
+    }
+  } else {
+    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
+    const uint16x8_t offset_u16 = vdupq_n_u16(offset);
+
+    for (int y = 0; y < h; ++y) {
+      for (int x = 0; x < w; x += 8) {
+        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
+        uint16x8_t d = vshlq_u16(s, round_shift_s16);
+        d = vaddq_u16(d, offset_u16);
+        vst1q_u16(dst_ptr + y * dst_stride + x, d);
+      }
+    }
+  }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
+                                               int src_stride, uint16_t *dst,
+                                               int dst_stride, int w, int h,
+                                               ConvolveParams *conv_params,
+                                               int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+  const int im_stride = MAX_SB_SIZE;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+
+  if (conv_params->do_average) {
+    highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
+                        round_offset);
+  } else {
+    highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
+                        round_offset);
+  }
+
+  if (conv_params->do_average) {
+    if (conv_params->use_dist_wtd_comp_avg) {
+      if (bd == 12) {
+        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, conv_params, round_offset, bd);
+      } else {
+        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+                                      h, conv_params, round_offset, bd);
+      }
+    } else {
+      if (bd == 12) {
+        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                conv_params, round_offset, bd);
+      } else {
+        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+                             conv_params, round_offset, bd);
+      }
+    }
+  }
+}
+
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t y_filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of y_filter are zero.
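+  // (A 6-tap kernel { a, b, c, d, e, f } is stored as the 8-tap kernel
+  // { 0, a, b, c, d, e, f, 0 }, so only lanes 1..6 are accumulated below.)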
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t y_filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+    s += 5 * src_stride;
+
+    do {
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 =
+          highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+      uint16x4_t d1 =
+          highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+      uint16x4_t d2 =
+          highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+      uint16x4_t d3 =
+          highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
+                                                y_filter, offset_vec);
+        uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
+                                                y_filter, offset_vec);
+        uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
+                                                y_filter, offset_vec);
+        uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
+                                                y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const int32x4_t offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, int offset) {
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w <= 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                              y_filter, offset_vec);
+      uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                              y_filter, offset_vec);
+      uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                              y_filter, offset_vec);
+      uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                              y_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+                                                y_filter, offset_vec);
+        uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+                                                y_filter, offset_vec);
+        uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+                                                y_filter, offset_vec);
+        uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+                                                y_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+  assert(h >= 5);
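+  // For example, with h == 4 and a 6-tap vertical filter, the caller passes
+  // im_h == 4 + 6 - 1 == 9 intermediate rows here, hence the 4-row main loop
+  // below followed by a single-row tail loop.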
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  int height = h;
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
+
+      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                            s0[5], x_filter, offset_vec);
+      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                            s1[5], x_filter, offset_vec);
+      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                            s2[5], x_filter, offset_vec);
+      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                            s3[5], x_filter, offset_vec);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height > 4);
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6];
+      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                            s0[5], x_filter, offset_vec);
+      vst1q_u16(d, d0);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  } while (--height != 0);
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+  assert(h >= 5);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  int height = h;
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
+
+      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], x_filter, offset_vec);
+      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], x_filter, offset_vec);
+      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], x_filter, offset_vec);
+      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], x_filter, offset_vec);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height > 4);
+
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x8_t s0[6];
+      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], x_filter, offset_vec);
+      vst1q_u16(d, d0);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  } while (--height != 0);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+  assert(h >= 5);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 1);
+    uint16_t *d = dst_ptr;
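+    // (A sketch of the pointer adjustments: the 4-tap kernels are assumed to
+    // occupy the middle four entries of the 8-tap array, hence
+    // x_filter_ptr + 2, and src_ptr + 1 compensates for the caller having
+    // subtracted the larger, clamped (>= 6-tap) horizontal offset.)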
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
+      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
+      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
+
+    do {
+      int16x4_t s0[4];
+      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+      vst1_u16(d, d0);
+
+      s += src_stride;
+      d += dst_stride;
+    } while (--h != 0);
+  } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 =
+            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+                                  s0[6], s0[7], x_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
+                                  s1[6], s1[7], x_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
+                                  s2[6], s2[7], x_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
+                                  s3[6], s3[7], x_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+
+        uint16x8_t d0 =
+            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+                                  s0[6], s0[7], x_filter, offset_vec);
+        vst1q_u16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  // The smallest block height is 4, and the horizontal convolution needs to
+  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+  assert(h >= 5);
+  const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 1);
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
+      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
+      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
+
+    do {
+      int16x4_t s0[4];
+      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+      vst1_u16(d, d0);
+
+      s += src_stride;
+      d += dst_stride;
+    } while (--h != 0);
+  } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+                               s0[7], x_filter, offset_vec);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
+                               s1[7], x_filter, offset_vec);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
+                               s2[7], x_filter, offset_vec);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
+                               s3[7], x_filter, offset_vec);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+                               s0[7], x_filter, offset_vec);
+        vst1q_u16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
+  }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
+  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+  const int im_h = h + clamped_y_taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = clamped_x_taps / 2 - 1;
+  // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
+  // faster non-rounding non-saturating left shift.
+  const int round_offset_conv_x =
+      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
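+  // This relies on the identity (for the non-negative sums produced here):
+  //   (v + (1 << (round_0 - 1))) >> round_0 == ROUND_POWER_OF_TWO(v, round_0)
+  // so the rounding constant can be folded into the offset once, up front.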
+  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset_conv_y = (1 << y_offset_bits);
+  const int round_offset_avg =
+      ((1 << (y_offset_bits - conv_params->round_1)) +
+       (1 << (y_offset_bits - conv_params->round_1 - 1)));
+
+  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  // horizontal filter
+  if (bd == 12) {
+    if (x_filter_taps <= 6 && w != 4) {
+      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+          round_offset_conv_x);
+    } else {
+      highbd_12_dist_wtd_convolve_2d_horiz_neon(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+          round_offset_conv_x);
+    }
+  } else {
+    if (x_filter_taps <= 6 && w != 4) {
+      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+          round_offset_conv_x);
+    } else {
+      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
+                                             im_stride, w, im_h, x_filter_ptr,
+                                             round_offset_conv_x);
+    }
+  }
+
+  // vertical filter
+  if (y_filter_taps <= 6) {
+    if (conv_params->do_average) {
+      highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
+                                                 im_stride, w, h, y_filter_ptr,
+                                                 round_offset_conv_y);
+    } else {
+      highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+          round_offset_conv_y);
+    }
+  } else {
+    if (conv_params->do_average) {
+      highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
+                                                 im_stride, w, h, y_filter_ptr,
+                                                 round_offset_conv_y);
+    } else {
+      highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+          round_offset_conv_y);
+    }
+  }
+
+  // Do the compound averaging outside the loop; this avoids branching within
+  // the main loop.
+  if (conv_params->do_average) {
+    if (conv_params->use_dist_wtd_comp_avg) {
+      if (bd == 12) {
+        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
+                                         w, h, conv_params, round_offset_avg,
+                                         bd);
+      } else {
+        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+                                      h, conv_params, round_offset_avg, bd);
+      }
+    } else {
+      if (bd == 12) {
+        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                                conv_params, round_offset_avg, bd);
+      } else {
+        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                             conv_params, round_offset_avg, bd);
+      }
+    }
+  }
+}
diff --git a/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
new file mode 100644
index 0000000..51da025
--- /dev/null
+++ b/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+#define UPSCALE_NORMATIVE_TAPS 8
+
+void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
+                                       uint16_t *dst, int dst_stride, int w,
+                                       int h, const int16_t *x_filters,
+                                       int x0_qn, int x_step_qn, int bd) {
+  const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  static const int32_t kIdx[4] = { 0, 1, 2, 3 };
+  const int32x4_t idx = vld1q_s32(kIdx);
+  const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
+  const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
+  const int32x4_t offset_s32 = vdupq_n_s32(0);
+  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+
+  const uint16_t *src_ptr = src - horiz_offset;
+  uint16_t *dst_ptr = dst;
+
+  if (w <= 4) {
+    int height = h;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int x_qn = x0_qn;
+
+      // Load 4 src vectors at a time; they might be the same, but we have to
+      // calculate the indices anyway. Doing it in SIMD and then storing the
+      // indices is faster than calculating the expression
+      // &src_ptr[((x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times.
+      // Ideally this would be a gather using the indices, but NEON does not
+      // have that, so we have to emulate it.
+      const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+      // We have to multiply the index by 2 to get the actual byte offset, as
+      // sizeof(uint16_t) == 2.
+      const int32x4_t src_idx =
+          vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+      // Similarly for the filter vectors, we calculate the filter indices for
+      // 4 columns. First we calculate the indices:
+      // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+      // Then we calculate the actual pointers, multiplying by
+      // UPSCALE_NORMATIVE_TAPS, and again shift left by 1 to convert to a
+      // byte offset.
+      const int32x4_t x_filter4_idx = vshlq_n_s32(
+          vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
+      // Even though pointers are unsigned 32/64-bit ints, we do signed
+      // addition. The reason is that x_qn can be negative, leading to
+      // negative offsets. Argon test
+      // profile0_core/streams/test10573_11003.obu was failing because of
+      // this.
+#if AOM_ARCH_AARCH64
+      uint64x2_t tmp4[2];
+      tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
+          vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
+      tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
+          vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
+      int16_t *src4_ptr[4];
+      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+      vst1q_u64(tmp_ptr, tmp4[0]);
+      vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+      // filter vectors
+      tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+          vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+      tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+          vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+      const int16_t *x_filter4_ptr[4];
+      tmp_ptr = (uint64_t *)&x_filter4_ptr;
+      vst1q_u64(tmp_ptr, tmp4[0]);
+      vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+      uint32x4_t tmp4;
+      tmp4 = vreinterpretq_u32_s32(
+          vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
+      int16_t *src4_ptr[4];
+      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+      vst1q_u32(tmp_ptr, tmp4);
+
+      // filter vectors
+      tmp4 = vreinterpretq_u32_s32(
+          vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+                    vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+      const int16_t *x_filter4_ptr[4];
+      tmp_ptr = (uint32_t *)&x_filter4_ptr;
+      vst1q_u32(tmp_ptr, tmp4);
+#endif  // AOM_ARCH_AARCH64
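+      // For reference, a scalar sketch of the addresses computed above:
+      //   src4_ptr[i]      = (int16_t *)&src_ptr[(x_qn + i * x_step_qn) >>
+      //                                          RS_SCALE_SUBPEL_BITS];
+      //   x_filter4_ptr[i] = &x_filters[(((x_qn + i * x_step_qn) &
+      //                                   RS_SCALE_SUBPEL_MASK) >>
+      //                                  RS_SCALE_EXTRA_BITS) *
+      //                                 UPSCALE_NORMATIVE_TAPS];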
+      // Load source
+      int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+      int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+      int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+      int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+      // Actually load the filters
+      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+      // Group low and high parts and transpose
+      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+                                 vget_low_s16(x_filter1),
+                                 vget_low_s16(x_filter2),
+                                 vget_low_s16(x_filter3) };
+      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+                                 vget_high_s16(x_filter1),
+                                 vget_high_s16(x_filter2),
+                                 vget_high_s16(x_filter3) };
+      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+      // Run the 2D Scale convolution
+      uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+      d0 = vmin_u16(d0, max);
+
+      if (w == 2) {
+        store_u16_2x1(d + 0 * dst_stride, d0, 0);
+      } else {
+        vst1_u16(d + 0 * dst_stride, d0);
+      }
+
+      src_ptr += src_stride;
+      d += dst_stride;
+      height--;
+    } while (height > 0);
+  } else {
+    int height = h;
+
+    do {
+      int width = w;
+      int x_qn = x0_qn;
+      uint16_t *d = dst_ptr;
+      const uint16_t *s = src_ptr;
+
+      do {
+        // Load 4 src vectors at a time; they might be the same, but we have to
+        // calculate the indices anyway. Doing it in SIMD and then storing the
+        // indices is faster than calculating the expression
+        // &src_ptr[((x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times.
+        // Ideally this would be a gather using the indices, but NEON does not
+        // have that, so we have to emulate it.
+        const int32x4_t xqn_idx =
+            vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+        // We have to multiply the index by 2 to get the actual byte offset,
+        // as sizeof(uint16_t) == 2.
+        const int32x4_t src_idx =
+            vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+
+        // Similarly for the filter vectors, we calculate the filter indices
+        // for 4 columns. First we calculate the indices:
+        // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+        // Then we calculate the actual pointers, multiplying by
+        // UPSCALE_NORMATIVE_TAPS, and again shift left by 1 to convert to a
+        // byte offset.
+        const int32x4_t x_filter4_idx = vshlq_n_s32(
+            vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
+            1);
+        // Even though pointers are unsigned 32/64-bit ints, we do signed
+        // addition. The reason is that x_qn can be negative, leading to
+        // negative offsets. Argon test
+        // profile0_core/streams/test10573_11003.obu was failing because of
+        // this.
+#if AOM_ARCH_AARCH64
+        uint64x2_t tmp4[2];
+        tmp4[0] = vreinterpretq_u64_s64(
+            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
+        tmp4[1] = vreinterpretq_u64_s64(
+            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
+        int16_t *src4_ptr[4];
+        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+        vst1q_u64(tmp_ptr, tmp4[0]);
+        vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+        // filter vectors
+        tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+            vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+        tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+            vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+        const int16_t *x_filter4_ptr[4];
+        tmp_ptr = (uint64_t *)&x_filter4_ptr;
+        vst1q_u64(tmp_ptr, tmp4[0]);
+        vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+        uint32x4_t tmp4;
+        tmp4 = vreinterpretq_u32_s32(
+            vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
+        int16_t *src4_ptr[4];
+        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+        vst1q_u32(tmp_ptr, tmp4);
+
+        // filter vectors
+        tmp4 = vreinterpretq_u32_s32(
+            vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+                      vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+        const int16_t *x_filter4_ptr[4];
+        tmp_ptr = (uint32_t *)&x_filter4_ptr;
+        vst1q_u32(tmp_ptr, tmp4);
+#endif  // AOM_ARCH_AARCH64
+
+        // Load source
+        int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+        int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+        int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+        int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+        // Actually load the filters
+        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+        // Group low and high parts and transpose
+        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+                                   vget_low_s16(x_filter1),
+                                   vget_low_s16(x_filter2),
+                                   vget_low_s16(x_filter3) };
+        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+                                   vget_high_s16(x_filter1),
+                                   vget_high_s16(x_filter2),
+                                   vget_high_s16(x_filter3) };
+        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+        // Run the 2D Scale X convolution
+        uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+        d0 = vmin_u16(d0, max);
+        vst1_u16(d, d0);
+
+        x_qn += 4 * x_step_qn;
+        d += 4;
+        width -= 4;
+      } while (width > 0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      height--;
+    } while (height > 0);
+  }
+}
diff --git a/av1/common/arm/highbd_convolve_neon.c b/av1/common/arm/highbd_convolve_neon.c
index fb18e28..3f5ff9e 100644
--- a/av1/common/arm/highbd_convolve_neon.c
+++ b/av1/common/arm/highbd_convolve_neon.c
@@ -17,62 +17,87 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
-#include "av1/common/arm/highbd_convolve_neon.h"
+
+static INLINE uint16x4_t
+highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                     const int16x8_t y_filter) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                     const int16x8_t y_filter) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
 
 static INLINE void highbd_convolve_y_sr_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr, const int bd) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
 
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
+  if (w == 4) {
     const int16_t *s = (const int16_t *)(src_ptr + src_stride);
     uint16_t *d = dst_ptr;
 
+    int16x4_t s0, s1, s2, s3, s4;
     load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
     s += 5 * src_stride;
 
     do {
+      int16x4_t s5, s6, s7, s8;
       load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
 
-      d0 = highbd_convolve6_4_s32_s16(s0, s1, s2, s3, s4, s5, y_filter_0_7,
-                                      zero_s32);
-      d1 = highbd_convolve6_4_s32_s16(s1, s2, s3, s4, s5, s6, y_filter_0_7,
-                                      zero_s32);
-      d2 = highbd_convolve6_4_s32_s16(s2, s3, s4, s5, s6, s7, y_filter_0_7,
-                                      zero_s32);
-      d3 = highbd_convolve6_4_s32_s16(s3, s4, s5, s6, s7, s8, y_filter_0_7,
-                                      zero_s32);
+      uint16x4_t d0 =
+          highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+      uint16x4_t d1 =
+          highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+      uint16x4_t d2 =
+          highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+      uint16x4_t d3 =
+          highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
 
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      d01 = vminq_u16(d01, max);
-      d23 = vminq_u16(d23, max);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -82,42 +107,37 @@
       s += 4 * src_stride;
       d += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    // if width is a multiple of 8 & height is a multiple of 4
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x8_t d0, d1, d2, d3;
-
+    // Width is a multiple of 8 and height is a multiple of 4.
     do {
       int height = h;
       const int16_t *s = (const int16_t *)(src_ptr + src_stride);
       uint16_t *d = dst_ptr;
 
+      int16x8_t s0, s1, s2, s3, s4;
       load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
       s += 5 * src_stride;
 
       do {
+        int16x8_t s5, s6, s7, s8;
         load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
 
-        d0 = highbd_convolve6_8_s32_s16(s0, s1, s2, s3, s4, s5, y_filter_0_7,
-                                        zero_s32);
-        d1 = highbd_convolve6_8_s32_s16(s1, s2, s3, s4, s5, s6, y_filter_0_7,
-                                        zero_s32);
-        d2 = highbd_convolve6_8_s32_s16(s2, s3, s4, s5, s6, s7, y_filter_0_7,
-                                        zero_s32);
-        d3 = highbd_convolve6_8_s32_s16(s3, s4, s5, s6, s7, s8, y_filter_0_7,
-                                        zero_s32);
+        uint16x8_t d0 =
+            highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+        uint16x8_t d1 =
+            highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+        uint16x8_t d2 =
+            highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+        uint16x8_t d3 =
+            highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -127,66 +147,96 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
 
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
+static INLINE uint16x4_t highbd_convolve8_4_y(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_y(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
 static INLINE void highbd_convolve_y_sr_8tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr, int bd) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
 
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
-
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
     load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
     s += 7 * src_stride;
 
     do {
+      int16x4_t s7, s8, s9, s10;
       load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-      d0 = highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                      zero_s32);
-      d1 = highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                      zero_s32);
-      d2 = highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                      zero_s32);
-      d3 = highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                      zero_s32);
+      uint16x4_t d0 =
+          highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      uint16x4_t d1 =
+          highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      uint16x4_t d2 =
+          highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      uint16x4_t d3 =
+          highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
 
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      d01 = vminq_u16(d01, max);
-      d23 = vminq_u16(d23, max);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -198,40 +248,36 @@
       s += 4 * src_stride;
       d += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x8_t d0, d1, d2, d3;
     do {
       int height = h;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
       load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
       s += 7 * src_stride;
 
       do {
+        int16x8_t s7, s8, s9, s10;
         load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                        y_filter, zero_s32);
-        d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                        y_filter, zero_s32);
-        d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
-                                        y_filter, zero_s32);
-        d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
-                                        y_filter, zero_s32);
+        uint16x8_t d0 =
+            highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+        uint16x8_t d1 =
+            highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+        uint16x8_t d2 =
+            highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+        uint16x8_t d3 =
+            highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -243,71 +289,117 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
+static INLINE uint16x4_t highbd_convolve12_4_y(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_y(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
 static INLINE void highbd_convolve_y_sr_12tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr, int bd) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
 
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
-
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
     load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                   &s9, &s10);
     s += 11 * src_stride;
 
     do {
+      int16x4_t s11, s12, s13, s14;
       load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
 
-      d0 = highbd_convolve12_y_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
-                                         s10, s11, y_filter_0_7, y_filter_8_11,
-                                         zero_s32);
-      d1 = highbd_convolve12_y_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9,
-                                         s10, s11, s12, y_filter_0_7,
-                                         y_filter_8_11, zero_s32);
-      d2 = highbd_convolve12_y_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                         s11, s12, s13, y_filter_0_7,
-                                         y_filter_8_11, zero_s32);
-      d3 = highbd_convolve12_y_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                         s12, s13, s14, y_filter_0_7,
-                                         y_filter_8_11, zero_s32);
+      uint16x4_t d0 =
+          highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                s11, y_filter_0_7, y_filter_8_11);
+      uint16x4_t d1 =
+          highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                                s12, y_filter_0_7, y_filter_8_11);
+      uint16x4_t d2 =
+          highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                                s13, y_filter_0_7, y_filter_8_11);
+      uint16x4_t d3 =
+          highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                                s14, y_filter_0_7, y_filter_8_11);
 
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      d01 = vminq_u16(d01, max);
-      d23 = vminq_u16(d23, max);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -323,46 +415,41 @@
       s += 4 * src_stride;
       d += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    uint16x8_t d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-
     do {
       int height = h;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
       load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                     &s9, &s10);
       s += 11 * src_stride;
 
       do {
+        int16x8_t s11, s12, s13, s14;
         load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
 
-        d0 = highbd_convolve12_y_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, s8,
-                                           s9, s10, s11, y_filter_0_7,
-                                           y_filter_8_11, zero_s32);
-        d1 = highbd_convolve12_y_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, s9,
-                                           s10, s11, s12, y_filter_0_7,
-                                           y_filter_8_11, zero_s32);
-        d2 = highbd_convolve12_y_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                           s11, s12, s13, y_filter_0_7,
-                                           y_filter_8_11, zero_s32);
-        d3 = highbd_convolve12_y_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                                           s12, s13, s14, y_filter_0_7,
-                                           y_filter_8_11, zero_s32);
+        uint16x8_t d0 =
+            highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+                                  s11, y_filter_0_7, y_filter_8_11);
+        uint16x8_t d1 =
+            highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+                                  s12, y_filter_0_7, y_filter_8_11);
+        uint16x8_t d2 =
+            highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                                  s13, y_filter_0_7, y_filter_8_11);
+        uint16x8_t d3 =
+            highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                                  s13, s14, y_filter_0_7, y_filter_8_11);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -378,12 +465,12 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
 
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
@@ -391,6 +478,11 @@
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd) {
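+  // Blocks with a width or height of 2 are not handled by the Neon paths
+  // below, which process the block in tiles of 4 rows, so fall back to the C
+  // implementation.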
+  if (w == 2 || h == 2) {
+    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_y, subpel_y_qn, bd);
+    return;
+  }
   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   const int vert_offset = filter_params_y->taps / 2 - 1;
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
@@ -413,207 +505,366 @@
                                  y_filter_ptr, bd);
 }
 
-static INLINE void highbd_convolve_x_sr_8tap_neon(
+static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
+                                              const int16x8_t x_filter,
+                                              const int32x4_t offset) {
+  // Values at indices 0 and 7 of x_filter are zero.
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int32x4_t sum0 = offset;
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
+
+  int32x4_t sum1 = offset;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE void highbd_convolve_x_sr_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
     int bd) {
   const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const int16x8_t bits_s16 = vdupq_n_s16(-bits);
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
+  // This shim allows us to do only one rounding shift instead of two.
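+  // Adding 1 << (round_0 - 1) up front supplies the first-stage rounding
+  // constant, so the final vqrshrun by FILTER_BITS applies both rounding
+  // shifts in a single operation.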
+  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
 
-  if (w <= 4) {
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
+  int height = h;
 
+  do {
+    int width = w;
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
     do {
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
 
-      d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter, shift_s32,
-                                           zero_s32);
-      d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter, shift_s32,
-                                           zero_s32);
+      uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset);
+      uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset);
+      uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset);
+      uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset);
 
-      d01 = vcombine_u16(d0, d1);
-      d01 = vqrshlq_u16(d01, bits_s16);
-      d01 = vminq_u16(d01, max);
+      d0 = vminq_u16(d0, max);
+      d1 = vminq_u16(d1, max);
+      d2 = vminq_u16(d2, max);
+      d3 = vminq_u16(d3, max);
 
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height != 0);
+}
+
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+                                              const int16x4_t x_filter,
+                                              const int32x4_t offset) {
+  int32x4_t sum = offset;
+  sum = vmlal_lane_s16(sum, s[0], x_filter, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
+                                              const int16x8_t x_filter,
+                                              const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int32x4_t sum0 = offset;
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+
+  int32x4_t sum1 = offset;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
+                                             int src_stride, uint16_t *dst_ptr,
+                                             int dst_stride, int w, int h,
+                                             const int16_t *x_filter_ptr,
+                                             ConvolveParams *conv_params,
+                                             int bd) {
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  // This shim allows us to do only one rounding shift instead of two.
+  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
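+    // The non-zero coefficients of the 4-tap kernel occupy indices 2-5 of the
+    // 8-tap array, hence the +2 offsets into both the filter and the source.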
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 2);
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset);
+      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset);
+      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset);
+      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset);
+
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
     int height = h;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0, d1, d2, d3;
+
     do {
       int width = w;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
-      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
-      s += 8;
-
       do {
-        load_s16_8x4(s, src_stride, &s1, &s3, &s5, &s7);
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
 
-        d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter, shift_s32,
-                                             zero_s32);
-        d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter, shift_s32,
-                                             zero_s32);
-        d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter, shift_s32,
-                                             zero_s32);
-        d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter, shift_s32,
-                                             zero_s32);
-
-        d0 = vqrshlq_u16(d0, bits_s16);
-        d1 = vqrshlq_u16(d1, bits_s16);
-        d2 = vqrshlq_u16(d2, bits_s16);
-        d3 = vqrshlq_u16(d3, bits_s16);
+        uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset);
+        uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset);
+        uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset);
+        uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        s0 = s1;
-        s2 = s3;
-        s4 = s5;
-        s6 = s7;
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       height -= 4;
-    } while (height > 0);
+    } while (height != 0);
   }
 }
 
+static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
+                                               const int16x8_t x_filter_0_7,
+                                               const int16x4_t x_filter_8_11,
+                                               const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t sum = offset;
+  sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
+
+  return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
+                                               const int16x8_t x_filter_0_7,
+                                               const int16x4_t x_filter_8_11,
+                                               const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t sum0 = offset;
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
+
+  int32x4_t sum1 = offset;
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                      vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
 static INLINE void highbd_convolve_x_sr_12tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
     int bd) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const int16x8_t bits_s16 = vdupq_n_s16(-bits);
+  // This shim allows us to do only one rounding shift instead of two.
+  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
   const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
 
-  if (w <= 4) {
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
-
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
     do {
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+      int16x4_t s0[12], s1[12], s2[12], s3[12];
+      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                    &s0[11]);
+      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+                    &s1[11]);
+      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+                    &s2[11]);
+      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+                    &s3[11]);
 
-      d0 = highbd_convolve12_horiz4_s32_s16(s0, s1, x_filter_0_7, x_filter_8_11,
-                                            shift_s32, zero_s32);
-      d1 = highbd_convolve12_horiz4_s32_s16(s2, s3, x_filter_0_7, x_filter_8_11,
-                                            shift_s32, zero_s32);
+      uint16x4_t d0 =
+          highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset);
+      uint16x4_t d1 =
+          highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset);
+      uint16x4_t d2 =
+          highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset);
+      uint16x4_t d3 =
+          highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset);
 
-      d01 = vcombine_u16(d0, d1);
-      d01 = vqrshlq_u16(d01, bits_s16);
-      d01 = vminq_u16(d01, max);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
     int height = h;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
-    uint16x8_t d0, d1, d2, d3;
+
     do {
       int width = w;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
-      load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
-      s += 8;
-
       do {
-        load_s16_8x4(s, src_stride, &s1, &s4, &s7, &s10);
-        load_s16_8x4(s + 8, src_stride, &s2, &s5, &s8, &s11);
+        int16x8_t s0[12], s1[12], s2[12], s3[12];
+        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                      &s0[11]);
+        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+                      &s1[11]);
+        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+                      &s2[11]);
+        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+                      &s3[11]);
 
-        d0 = highbd_convolve12_horiz8_s32_s16(
-            s0, s1, s2, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
-        d1 = highbd_convolve12_horiz8_s32_s16(
-            s3, s4, s5, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
-        d2 = highbd_convolve12_horiz8_s32_s16(
-            s6, s7, s8, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
-        d3 = highbd_convolve12_horiz8_s32_s16(
-            s9, s10, s11, x_filter_0_7, x_filter_8_11, shift_s32, zero_s32);
-
-        d0 = vqrshlq_u16(d0, bits_s16);
-        d1 = vqrshlq_u16(d1, bits_s16);
-        d2 = vqrshlq_u16(d2, bits_s16);
-        d3 = vqrshlq_u16(d3, bits_s16);
+        uint16x8_t d0 =
+            highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset);
+        uint16x8_t d1 =
+            highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset);
+        uint16x8_t d2 =
+            highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset);
+        uint16x8_t d3 =
+            highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        s0 = s1;
-        s1 = s2;
-        s3 = s4;
-        s4 = s5;
-        s6 = s7;
-        s7 = s8;
-        s9 = s10;
-        s10 = s11;
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       height -= 4;
-    } while (height > 0);
+    } while (height != 0);
   }
 }
 
@@ -622,6 +873,11 @@
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
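+  // As with the vertical filter, blocks with a width or height of 2 are not
+  // covered by the 4-row Neon tiles below, so fall back to the C
+  // implementation.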
+  if (w == 2 || h == 2) {
+    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params, bd);
+    return;
+  }
   const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   const int horiz_offset = filter_params_x->taps / 2 - 1;
   const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
@@ -634,70 +890,250 @@
                                     x_filter_ptr, conv_params, bd);
     return;
   }
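+  // 6-tap filters have zero coefficients at indices 0 and 7 of the 8-tap
+  // array, so advance the source pointer by one to skip the leading zero tap.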
+  if (x_filter_taps <= 6 && w != 4) {
+    highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h,
+                                   x_filter_ptr, conv_params, bd);
+    return;
+  }
 
-  highbd_convolve_x_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
-                                 x_filter_ptr, conv_params, bd);
+  highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
+                            x_filter_ptr, conv_params, bd);
 }
 
-static INLINE void highbd_convolve_2d_y_sr_8tap_neon(
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t y_filter, const int32x4_t round_shift,
+    const int32x4_t offset) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+  sum = vshlq_s32(sum, round_shift);
+  return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t y_filter, const int32x4_t round_shift,
+    const int32x4_t offset) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+  sum0 = vshlq_s32(sum0, round_shift);
+  sum1 = vshlq_s32(sum1, round_shift);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_vert_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
-    int bd, const int offset, const int correction) {
+    int bd, const int offset) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
   const int32x4_t offset_s32 = vdupq_n_s32(offset);
   const int round1_shift = conv_params->round_1;
   const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
-  const int32x4_t correction_s32 = vdupq_n_s32(correction);
 
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+    int16x4_t s0, s1, s2, s3, s4;
+    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+    s += 5 * src_stride;
 
+    do {
+      int16x4_t s5, s6, s7, s8;
+      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+      uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
+                                              round1_shift_s32, offset_s32);
+      uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
+                                              round1_shift_s32, offset_s32);
+      uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
+                                              round1_shift_s32, offset_s32);
+      uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
+                                              round1_shift_s32, offset_s32);
+
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+      s += 5 * src_stride;
+
+      do {
+        int16x8_t s5, s6, s7, s8;
+        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+        uint16x8_t d0 = highbd_convolve6_8_2d_v(
+            s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32);
+        uint16x8_t d1 = highbd_convolve6_8_2d_v(
+            s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32);
+        uint16x8_t d2 = highbd_convolve6_8_2d_v(
+            s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32);
+        uint16x8_t d3 = highbd_convolve6_8_2d_v(
+            s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+  sum = vshlq_s32(sum, round_shift);
+  return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+
+  sum0 = vshlq_s32(sum0, round_shift);
+  sum1 = vshlq_s32(sum1, round_shift);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_vert_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
+    int bd, const int offset) {
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+  const int32x4_t offset_s32 = vdupq_n_s32(offset);
+  const int round1_shift = conv_params->round_1;
+  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
+
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
     load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
     s += 7 * src_stride;
 
     do {
+      int16x4_t s7, s8, s9, s10;
       load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-      d0 = highbd_convolve8_4_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                         y_filter, round1_shift_s32, offset_s32,
-                                         correction_s32);
-      d1 = highbd_convolve8_4_sr_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                         y_filter, round1_shift_s32, offset_s32,
-                                         correction_s32);
-      d2 = highbd_convolve8_4_sr_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
-                                         y_filter, round1_shift_s32, offset_s32,
-                                         correction_s32);
-      d3 = highbd_convolve8_4_sr_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
-                                         y_filter, round1_shift_s32, offset_s32,
-                                         correction_s32);
+      uint16x4_t d0 =
+          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                  round1_shift_s32, offset_s32);
+      uint16x4_t d1 =
+          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                  round1_shift_s32, offset_s32);
+      uint16x4_t d2 =
+          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                  round1_shift_s32, offset_s32);
+      uint16x4_t d3 =
+          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                  round1_shift_s32, offset_s32);
 
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      d01 = vminq_u16(d01, max);
-      d23 = vminq_u16(d23, max);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -709,44 +1145,40 @@
       s += 4 * src_stride;
       d += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x8_t d0, d1, d2, d3;
     do {
       int height = h;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
       load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
       s += 7 * src_stride;
 
       do {
+        int16x8_t s7, s8, s9, s10;
         load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        d0 = highbd_convolve8_8_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                           y_filter, round1_shift_s32,
-                                           offset_s32, correction_s32);
-        d1 = highbd_convolve8_8_sr_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                           y_filter, round1_shift_s32,
-                                           offset_s32, correction_s32);
-        d2 = highbd_convolve8_8_sr_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
-                                           y_filter, round1_shift_s32,
-                                           offset_s32, correction_s32);
-        d3 = highbd_convolve8_8_sr_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
-                                           y_filter, round1_shift_s32,
-                                           offset_s32, correction_s32);
+        uint16x8_t d0 =
+            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                    round1_shift_s32, offset_s32);
+        uint16x8_t d1 =
+            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                    round1_shift_s32, offset_s32);
+        uint16x8_t d2 =
+            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                    round1_shift_s32, offset_s32);
+        uint16x8_t d3 =
+            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                    round1_shift_s32, offset_s32);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -758,75 +1190,126 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
-static INLINE void highbd_convolve_2d_y_sr_12tap_neon(
+static INLINE uint16x4_t highbd_convolve12_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+  sum = vshlq_s32(sum, round_shift);
+  return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+  sum0 = vshlq_s32(sum0, round_shift);
+  sum1 = vshlq_s32(sum1, round_shift);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_vert_12tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
-    const int bd, const int offset, const int correction) {
+    const int bd, const int offset) {
   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
   const int32x4_t offset_s32 = vdupq_n_s32(offset);
   const int round1_shift = conv_params->round_1;
   const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
-  const int32x4_t correction_s32 = vdupq_n_s32(correction);
 
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
-
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
     load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                   &s9, &s10);
     s += 11 * src_stride;
 
     do {
+      int16x4_t s11, s12, s13, s14;
       load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
 
-      d0 = highbd_convolve12_y_4_sr_s32_s16(
+      uint16x4_t d0 = highbd_convolve12_4_2d_v(
           s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
-          y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-      d1 = highbd_convolve12_y_4_sr_s32_s16(
+          y_filter_8_11, round1_shift_s32, offset_s32);
+      uint16x4_t d1 = highbd_convolve12_4_2d_v(
           s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
-          y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-      d2 = highbd_convolve12_y_4_sr_s32_s16(
+          y_filter_8_11, round1_shift_s32, offset_s32);
+      uint16x4_t d2 = highbd_convolve12_4_2d_v(
           s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
-          y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-      d3 = highbd_convolve12_y_4_sr_s32_s16(
+          y_filter_8_11, round1_shift_s32, offset_s32);
+      uint16x4_t d3 = highbd_convolve12_4_2d_v(
           s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
-          y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
+          y_filter_8_11, round1_shift_s32, offset_s32);
 
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      d0 = vmin_u16(d0, vget_low_u16(max));
+      d1 = vmin_u16(d1, vget_low_u16(max));
+      d2 = vmin_u16(d2, vget_low_u16(max));
+      d3 = vmin_u16(d3, vget_low_u16(max));
 
-      d01 = vminq_u16(d01, max);
-      d23 = vminq_u16(d23, max);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
-      }
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
       s0 = s4;
       s1 = s5;
@@ -842,46 +1325,41 @@
       s += 4 * src_stride;
       d += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    uint16x8_t d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-
     do {
       int height = h;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
       load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                     &s9, &s10);
       s += 11 * src_stride;
 
       do {
+        int16x8_t s11, s12, s13, s14;
         load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
 
-        d0 = highbd_convolve12_y_8_sr_s32_s16(
+        uint16x8_t d0 = highbd_convolve12_8_2d_v(
             s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
-            y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-        d1 = highbd_convolve12_y_8_sr_s32_s16(
+            y_filter_8_11, round1_shift_s32, offset_s32);
+        uint16x8_t d1 = highbd_convolve12_8_2d_v(
             s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
-            y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-        d2 = highbd_convolve12_y_8_sr_s32_s16(
+            y_filter_8_11, round1_shift_s32, offset_s32);
+        uint16x8_t d2 = highbd_convolve12_8_2d_v(
             s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
-            y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
-        d3 = highbd_convolve12_y_8_sr_s32_s16(
+            y_filter_8_11, round1_shift_s32, offset_s32);
+        uint16x8_t d3 = highbd_convolve12_8_2d_v(
             s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
-            y_filter_8_11, round1_shift_s32, offset_s32, correction_s32);
+            y_filter_8_11, round1_shift_s32, offset_s32);
 
         d0 = vminq_u16(d0, max);
         d1 = vminq_u16(d1, max);
         d2 = vminq_u16(d2, max);
         d3 = vminq_u16(d3, max);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -897,186 +1375,465 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
 
       src_ptr += 8;
       dst_ptr += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
-static INLINE void highbd_convolve_x_8tap_neon(
+static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
+                                                 const int16x8_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  // Values at indices 0 and 7 of x_filter are zero, so only taps 1-6 are
+  // accumulated below.
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
+
+  sum0 = vqrshlq_s32(sum0, shift_s32);
+  sum1 = vqrshlq_s32(sum1, shift_s32);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
     const int offset) {
-  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+  // The smallest block height processed by the SIMD functions is 4, and the
+  // horizontal convolution needs to process (filter_taps/2 - 1) extra lines
+  // for the vertical convolution.
+  assert(h >= 5);
   const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
   const int32x4_t offset_s32 = vdupq_n_s32(offset);
 
-  if (w <= 4) {
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+  int height = h;
 
+  do {
+    int width = w;
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
     do {
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+      int16x8_t s0[6], s1[6], s2[6], s3[6];
+      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5]);
+      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5]);
+      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5]);
+      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5]);
 
-      d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter, shift_s32,
-                                           offset_s32);
-      d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter, shift_s32,
-                                           offset_s32);
+      uint16x8_t d0 =
+          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+      uint16x8_t d1 =
+          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+      uint16x8_t d2 =
+          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+      uint16x8_t d3 =
+          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);
 
-      d01 = vcombine_u16(d0, d1);
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    height -= 4;
+  } while (height > 4);
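+  // Process the remaining rows (at most 4, as the intermediate block height
+  // is not necessarily a multiple of 4) one at a time. The other horizontal
+  // kernels below use the same main-loop / tail-loop split.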
+  do {
+    int width = w;
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
 
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
+    do {
+      int16x8_t s0[6];
+      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+      uint16x8_t d0 =
+          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+      vst1q_u16(d, d0);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  } while (--height != 0);
+}
+
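+// 4-tap horizontal helper used for width-4 blocks. The 4-tap kernels are
+// stored as 8-tap arrays whose two outermost taps on each side are zero, so
+// the caller passes x_filter_ptr + 2 to select the four non-zero taps.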
+static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
+                                                 const int16x4_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  sum = vqrshlq_s32(sum, shift_s32);
+  return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
+                                                 const int16x8_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+
+  sum0 = vqrshlq_s32(sum0, shift_s32);
+  sum1 = vqrshlq_s32(sum1, shift_s32);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
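+// Horizontal pass for the remaining filter lengths: width-4 blocks use the
+// 4-tap helper above; wider blocks use the 8-tap helper, 8 columns at a time
+// and (in the main loop) 4 rows per iteration. Each result is offset and
+// rounding-shifted right by round_0 before being stored to the intermediate
+// buffer.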
+static INLINE void highbd_convolve_2d_sr_horiz_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+    const int offset) {
+  // The smallest block height processed by the SIMD functions is 4, and the
+  // horizontal convolution needs to process (filter_taps/2 - 1) extra lines
+  // for the vertical convolution.
+  assert(h >= 5);
+  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+  const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width <= 4.
+    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+    const int16_t *s = (const int16_t *)(src_ptr + 1);
+    uint16_t *d = dst_ptr;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 =
+          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+      uint16x4_t d1 =
+          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
+      uint16x4_t d2 =
+          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
+      uint16x4_t d3 =
+          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
+
+    do {
+      int16x4_t s0[4];
+      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+      uint16x4_t d0 =
+          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+
+      vst1_u16(d, d0);
+
+      s += src_stride;
+      d += dst_stride;
+    } while (--h != 0);
   } else {
+    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
     int height = h;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0, d1, d2, d3;
+
     do {
       int width = w;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
-      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
-      s += 8;
-
       do {
-        load_s16_8x4(s, src_stride, &s1, &s3, &s5, &s7);
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
 
-        d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter, shift_s32,
-                                             offset_s32);
-        d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter, shift_s32,
-                                             offset_s32);
-        d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter, shift_s32,
-                                             offset_s32);
-        d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter, shift_s32,
-                                             offset_s32);
+        uint16x8_t d0 =
+            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+        uint16x8_t d1 =
+            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+        uint16x8_t d2 =
+            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+        uint16x8_t d3 =
+            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        s0 = s1;
-        s2 = s3;
-        s4 = s5;
-        s6 = s7;
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       height -= 4;
-    } while (height > 0);
+    } while (height > 4);
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        int16x8_t s0[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+        vst1q_u16(d, d0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
   }
 }
 
-static INLINE void highbd_convolve_2d_x_sr_12tap_neon(
+static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
+                                                  const int16x8_t x_filter_0_7,
+                                                  const int16x4_t x_filter_8_11,
+                                                  const int32x4_t shift_s32,
+                                                  const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
+
+  sum = vqrshlq_s32(sum, shift_s32);
+  return vqmovun_s32(sum);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
+                                                  const int16x8_t x_filter_0_7,
+                                                  const int16x4_t x_filter_8_11,
+                                                  const int32x4_t shift_s32,
+                                                  const int32x4_t offset) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+  sum0 = vqrshlq_s32(sum0, shift_s32);
+  sum1 = vqrshlq_s32(sum1, shift_s32);
+
+  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
+}
+
+static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon(
     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
     const int offset) {
+  // The smallest block height processed by the SIMD functions is 4, and the
+  // horizontal convolution needs to process (filter_taps/2 - 1) extra lines
+  // for the vertical convolution.
+  assert(h >= 5);
   const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
   const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
   const int32x4_t offset_s32 = vdupq_n_s32(offset);
 
-  if (w <= 4) {
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
-
+  if (w == 4) {
     const int16_t *s = (const int16_t *)src_ptr;
     uint16_t *d = dst_ptr;
 
     do {
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+      int16x4_t s0[12], s1[12], s2[12], s3[12];
+      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                    &s0[11]);
+      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+                    &s1[11]);
+      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+                    &s2[11]);
+      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+                    &s3[11]);
 
-      d0 = highbd_convolve12_horiz4_s32_s16(s0, s1, x_filter_0_7, x_filter_8_11,
-                                            shift_s32, offset_s32);
-      d1 = highbd_convolve12_horiz4_s32_s16(s2, s3, x_filter_0_7, x_filter_8_11,
-                                            shift_s32, offset_s32);
+      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+                                               shift_s32, offset_s32);
+      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
+                                               shift_s32, offset_s32);
+      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
+                                               shift_s32, offset_s32);
+      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
+                                               shift_s32, offset_s32);
 
-      d01 = vcombine_u16(d0, d1);
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
 
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
 
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
+    do {
+      int16x4_t s0[12];
+      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
+                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);
+
+      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+                                               shift_s32, offset_s32);
+
+      vst1_u16(d, d0);
+
+      s += src_stride;
+      d += dst_stride;
+    } while (--h != 0);
   } else {
     int height = h;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
-    uint16x8_t d0, d1, d2, d3;
+
     do {
       int width = w;
       const int16_t *s = (const int16_t *)src_ptr;
       uint16_t *d = dst_ptr;
 
-      load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
-      s += 8;
+      do {
+        int16x8_t s0[12], s1[12], s2[12], s3[12];
+        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                      &s0[11]);
+        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+                      &s1[11]);
+        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+                      &s2[11]);
+        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+                      &s3[11]);
+
+        uint16x8_t d0 = highbd_convolve12_8_2d_h(
+            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+        uint16x8_t d1 = highbd_convolve12_8_2d_h(
+            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+        uint16x8_t d2 = highbd_convolve12_8_2d_h(
+            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+        uint16x8_t d3 = highbd_convolve12_8_2d_h(
+            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 4);
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
 
       do {
-        load_s16_8x4(s, src_stride, &s1, &s4, &s7, &s10);
-        load_s16_8x4(s + 8, src_stride, &s2, &s5, &s8, &s11);
+        int16x8_t s0[12];
+        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                      &s0[11]);
 
-        d0 = highbd_convolve12_horiz8_s32_s16(
-            s0, s1, s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
-        d1 = highbd_convolve12_horiz8_s32_s16(
-            s3, s4, s5, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
-        d2 = highbd_convolve12_horiz8_s32_s16(
-            s6, s7, s8, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
-        d3 = highbd_convolve12_horiz8_s32_s16(
-            s9, s10, s11, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+        uint16x8_t d0 = highbd_convolve12_8_2d_h(
+            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+        vst1q_u16(d, d0);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
-
-        s0 = s1;
-        s1 = s2;
-        s3 = s4;
-        s4 = s5;
-        s6 = s7;
-        s7 = s8;
-        s9 = s10;
-        s10 = s11;
         s += 8;
         d += 8;
         width -= 8;
       } while (width > 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--height != 0);
   }
 }
 
@@ -1087,18 +1844,29 @@
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
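+  // The SIMD kernels below operate on at least 4 rows and 4 columns at a
+  // time, so blocks with a width or height of 2 take the scalar C path.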
+  if (w == 2 || h == 2) {
+    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params, bd);
+    return;
+  }
   DECLARE_ALIGNED(16, uint16_t,
                   im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
-  const int im_h = h + filter_params_y->taps - 1;
+  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
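+  // Filters shorter than 6 taps are stored with zero outer taps, so they can
+  // be handled by the 6-tap (or, for width-4 blocks, 4-tap) kernels; clamp
+  // the tap counts used for the intermediate block size and source offsets
+  // to at least 6.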
+  const int im_h = h + clamped_y_taps - 1;
   const int im_stride = MAX_SB_SIZE;
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const int horiz_offset = clamped_x_taps / 2 - 1;
   const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
   const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int y_offset_initial = (1 << y_offset_bits);
-  const int y_offset_correction =
-      ((1 << (y_offset_bits - conv_params->round_1)) +
-       (1 << (y_offset_bits - conv_params->round_1 - 1)));
+  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
+  // simple shift left instead of a rounding saturating shift left.
+  const int y_offset =
+      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
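+  // Concretely, the vertical accumulators start from y_offset, so the plain
+  // arithmetic shift by round_1 in the vertical kernels computes
+  //   (sum - (1 << (y_offset_bits - 1)) + (1 << (round_1 - 1))) >> round_1,
+  // i.e. a rounding shift of the filtered sum with the horizontal-pass offset
+  // (which contributes 1 << (y_offset_bits - 1) after filtering, since the
+  // filter taps sum to 1 << FILTER_BITS) removed.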
 
   const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
 
@@ -1107,1275 +1875,246 @@
   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  if (filter_params_x->taps > 8) {
-    highbd_convolve_2d_x_sr_12tap_neon(src_ptr, src_stride, im_block, im_stride,
-                                       w, im_h, x_filter_ptr, conv_params,
-                                       x_offset_initial);
+  if (x_filter_taps > 8) {
+    highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
+                                           im_stride, w, im_h, x_filter_ptr,
+                                           conv_params, x_offset_initial);
 
-    highbd_convolve_2d_y_sr_12tap_neon(im_block, im_stride, dst, dst_stride, w,
-                                       h, y_filter_ptr, conv_params, bd,
-                                       y_offset_initial, y_offset_correction);
-  } else {
-    highbd_convolve_x_8tap_neon(src_ptr, src_stride, im_block, im_stride, w,
-                                im_h, x_filter_ptr, conv_params,
-                                x_offset_initial);
-
-    highbd_convolve_2d_y_sr_8tap_neon(im_block, im_stride, dst, dst_stride, w,
-                                      h, y_filter_ptr, conv_params, bd,
-                                      y_offset_initial, y_offset_correction);
-  }
-}
-
-static INLINE void highbd_convolve_2d_x_scale_8tap_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int subpel_x_qn, const int x_step_qn,
-    const InterpFilterParams *filter_params, ConvolveParams *conv_params,
-    const int offset) {
-  const uint32x4_t idx = { 0, 1, 2, 3 };
-  const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
-  const int32x4_t offset_s32 = vdupq_n_s32(offset);
-
-  if (w <= 4) {
-    int height = h;
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0;
-
-    uint16_t *d = dst_ptr;
-
-    do {
-      int x_qn = subpel_x_qn;
-
-      // Load 4 src vectors at a time, they might be the same, but we have to
-      // calculate the indices anyway. Doing it in SIMD and then storing the
-      // indices is faster than having to calculate the expression
-      // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
-      // Ideally this should be a gather using the indices, but NEON does not
-      // have that, so have to emulate
-      const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
-      // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
-      // 2
-      const uint32x4_t src_idx_u32 =
-          vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if AOM_ARCH_AARCH64
-      uint64x2_t src4[2];
-      src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
-                          vget_low_u32(src_idx_u32));
-      src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
-                          vget_high_u32(src_idx_u32));
-      int16_t *src4_ptr[4];
-      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
-      vst1q_u64(tmp_ptr, src4[0]);
-      vst1q_u64(tmp_ptr + 2, src4[1]);
-#else
-      uint32x4_t src4;
-      src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32);
-      int16_t *src4_ptr[4];
-      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
-      vst1q_u32(tmp_ptr, src4);
-#endif  // AOM_ARCH_AARCH64
-      // Same for the filter vectors
-      const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
-          vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
-      int32_t x_filter4_idx[4];
-      vst1q_s32(x_filter4_idx, filter_idx_s32);
-      const int16_t *x_filter4_ptr[4];
-
-      // Load source
-      s0 = vld1q_s16(src4_ptr[0]);
-      s1 = vld1q_s16(src4_ptr[1]);
-      s2 = vld1q_s16(src4_ptr[2]);
-      s3 = vld1q_s16(src4_ptr[3]);
-
-      // We could easily do this using SIMD as well instead of calling the
-      // inline function 4 times.
-      x_filter4_ptr[0] =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
-      x_filter4_ptr[1] =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
-      x_filter4_ptr[2] =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
-      x_filter4_ptr[3] =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
-
-      // Actually load the filters
-      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
-      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
-      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
-      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
-      // Group low and high parts and transpose
-      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
-                                 vget_low_s16(x_filter1),
-                                 vget_low_s16(x_filter2),
-                                 vget_low_s16(x_filter3) };
-      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
-                                 vget_high_s16(x_filter1),
-                                 vget_high_s16(x_filter2),
-                                 vget_high_s16(x_filter3) };
-      transpose_u16_4x4((uint16x4_t *)filters_lo);
-      transpose_u16_4x4((uint16x4_t *)filters_hi);
-
-      // Run the 2D Scale convolution
-      d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
-          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
-      if (w == 2) {
-        store_u16_2x1(d + 0 * dst_stride, d0, 0);
-      } else {
-        vst1_u16(d + 0 * dst_stride, d0);
-      }
-
-      src_ptr += src_stride;
-      d += dst_stride;
-      height--;
-    } while (height > 0);
-  } else {
-    int height = h;
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0;
-
-    do {
-      int width = w;
-      int x_qn = subpel_x_qn;
-      uint16_t *d = dst_ptr;
-      const uint16_t *s = src_ptr;
-
-      do {
-        // Load 4 src vectors at a time, they might be the same, but we have to
-        // calculate the indices anyway. Doing it in SIMD and then storing the
-        // indices is faster than having to calculate the expression
-        // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
-        // Ideally this should be a gather using the indices, but NEON does not
-        // have that, so have to emulate
-        const uint32x4_t xqn_idx =
-            vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
-        // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
-        // = 2
-        const uint32x4_t src_idx_u32 =
-            vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
-#if AOM_ARCH_AARCH64
-        uint64x2_t src4[2];
-        src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
-                            vget_low_u32(src_idx_u32));
-        src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
-                            vget_high_u32(src_idx_u32));
-        int16_t *src4_ptr[4];
-        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
-        vst1q_u64(tmp_ptr, src4[0]);
-        vst1q_u64(tmp_ptr + 2, src4[1]);
-#else
-        uint32x4_t src4;
-        src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
-        int16_t *src4_ptr[4];
-        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
-        vst1q_u32(tmp_ptr, src4);
-#endif  // AOM_ARCH_AARCH64
-        // Same for the filter vectors
-        const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
-            vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
-        int32_t x_filter4_idx[4];
-        vst1q_s32(x_filter4_idx, filter_idx_s32);
-        const int16_t *x_filter4_ptr[4];
-
-        // Load source
-        s0 = vld1q_s16(src4_ptr[0]);
-        s1 = vld1q_s16(src4_ptr[1]);
-        s2 = vld1q_s16(src4_ptr[2]);
-        s3 = vld1q_s16(src4_ptr[3]);
-
-        // We could easily do this using SIMD as well instead of calling the
-        // inline function 4 times.
-        x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(
-            filter_params, x_filter4_idx[0]);
-        x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(
-            filter_params, x_filter4_idx[1]);
-        x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(
-            filter_params, x_filter4_idx[2]);
-        x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(
-            filter_params, x_filter4_idx[3]);
-
-        // Actually load the filters
-        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
-        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
-        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
-        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
-        // Group low and high parts and transpose
-        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
-                                   vget_low_s16(x_filter1),
-                                   vget_low_s16(x_filter2),
-                                   vget_low_s16(x_filter3) };
-        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
-                                   vget_high_s16(x_filter1),
-                                   vget_high_s16(x_filter2),
-                                   vget_high_s16(x_filter3) };
-        transpose_u16_4x4((uint16x4_t *)filters_lo);
-        transpose_u16_4x4((uint16x4_t *)filters_hi);
-
-        // Run the 2D Scale X convolution
-        d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
-            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
-        vst1_u16(d, d0);
-
-        x_qn += 4 * x_step_qn;
-        d += 4;
-        width -= 4;
-      } while (width > 0);
-
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-    } while (height > 0);
-  }
-}
-
-static INLINE void highbd_convolve_2d_y_scale_8tap_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int subpel_y_qn, const int y_step_qn,
-    const InterpFilterParams *filter_params, const int round1_bits,
-    const int offset) {
-  const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
-
-  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
-  if (w <= 4) {
-    int height = h;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t d0;
-
-    uint16_t *d = dst_ptr;
-
-    int y_qn = subpel_y_qn;
-    do {
-      const int16_t *s =
-          (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-
-      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      const int16_t *y_filter_ptr =
-          av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
-      const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
-      d0 = highbd_convolve8_4_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                         y_filter, round1_shift_s32, offset_s32,
-                                         vdupq_n_s32(0));
-
-      if (w == 2) {
-        store_u16_2x1(d, d0, 0);
-      } else {
-        vst1_u16(d, d0);
-      }
-
-      y_qn += y_step_qn;
-      d += dst_stride;
-      height--;
-    } while (height > 0);
-  } else {
-    int width = w;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0;
-
-    do {
-      int height = h;
-      int y_qn = subpel_y_qn;
-
-      uint16_t *d = dst_ptr;
-
-      do {
-        const int16_t *s =
-            (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-        const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-        const int16_t *y_filter_ptr =
-            av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
-        const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
-        d0 = highbd_convolve8_8_sr_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                           y_filter, round1_shift_s32,
-                                           offset_s32, vdupq_n_s32(0));
-        vst1q_u16(d, d0);
-
-        y_qn += y_step_qn;
-        d += dst_stride;
-        height--;
-      } while (height > 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      width -= 8;
-    } while (width > 0);
-  }
-}
-
-static INLINE void highbd_dist_wtd_comp_avg_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, ConvolveParams *conv_params, const int round_bits,
-    const int offset, const int bd) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  const int dst16_stride = conv_params->dst_stride;
-  const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
-  const int16x4_t offset_s16 = vdup_n_s16(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-  uint16x4_t fwd_offset_u16 = vdup_n_u16(conv_params->fwd_offset);
-  uint16x4_t bck_offset_u16 = vdup_n_u16(conv_params->bck_offset);
-
-  // Weighted averaging
-  if (w <= 4) {
-    for (int y = 0; y < h; ++y) {
-      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
-      const uint16x4_t d16 = vld1_u16(dst16 + y * dst16_stride);
-      // We use vmull_u16/vmlal_u16 instead of of vmull_s16/vmlal_s16
-      // because the latter sign-extend and the values are non-negative.
-      // However, d0/d1 are signed-integers and we use vqmovun
-      // to do saturated narrowing to unsigned.
-      int32x4_t d0 = vreinterpretq_s32_u32(vmull_u16(d16, fwd_offset_u16));
-      d0 = vreinterpretq_s32_u32(
-          vmlal_u16(vreinterpretq_u32_s32(d0), s, bck_offset_u16));
-      d0 = vshrq_n_s32(d0, DIST_PRECISION_BITS);
-      // Subtract round offset and convolve round
-      d0 = vqrshlq_s32(vsubw_s16(d0, offset_s16), round_shift_s32);
-      uint16x4_t d = vqmovun_s32(d0);
-      d = vmin_u16(d, vget_low_u16(max));
-      if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
-      } else {
-        vst1_u16(dst_ptr + y * dst_stride, d);
-      }
-    }
-  } else {
-    for (int y = 0; y < h; ++y) {
-      for (int x = 0; x < w; x += 8) {
-        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
-        const uint16x8_t d16 = vld1q_u16(dst16 + y * dst16_stride + x);
-        // We use vmull_u16/vmlal_u16 instead of of vmull_s16/vmlal_s16
-        // because the latter sign-extend and the values are non-negative.
-        // However, d0/d1 are signed-integers and we use vqmovun
-        // to do saturated narrowing to unsigned.
-        int32x4_t d0 =
-            vreinterpretq_s32_u32(vmull_u16(vget_low_u16(d16), fwd_offset_u16));
-        int32x4_t d1 = vreinterpretq_s32_u32(
-            vmull_u16(vget_high_u16(d16), fwd_offset_u16));
-        d0 = vreinterpretq_s32_u32(vmlal_u16(vreinterpretq_u32_s32(d0),
-                                             vget_low_u16(s), bck_offset_u16));
-        d1 = vreinterpretq_s32_u32(vmlal_u16(vreinterpretq_u32_s32(d1),
-                                             vget_high_u16(s), bck_offset_u16));
-        d0 = vshrq_n_s32(d0, DIST_PRECISION_BITS);
-        d1 = vshrq_n_s32(d1, DIST_PRECISION_BITS);
-        d0 = vqrshlq_s32(vsubw_s16(d0, offset_s16), round_shift_s32);
-        d1 = vqrshlq_s32(vsubw_s16(d1, offset_s16), round_shift_s32);
-        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
-        d01 = vminq_u16(d01, max);
-        vst1q_u16(dst_ptr + y * dst_stride + x, d01);
-      }
-    }
-  }
-}
-
-static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
-                                        uint16_t *dst_ptr, int dst_stride,
-                                        int w, int h,
-                                        ConvolveParams *conv_params,
-                                        const int round_bits, const int offset,
-                                        const int bd) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  const int dst16_stride = conv_params->dst_stride;
-  const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
-  const int16x4_t offset_s16 = vdup_n_s16(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-  if (w <= 4) {
-    for (int y = 0; y < h; ++y) {
-      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
-      const uint16x4_t d16 = vld1_u16(dst16 + y * dst16_stride);
-      int32x4_t s_s32 = vreinterpretq_s32_u32(vmovl_u16(s));
-      int32x4_t d16_s32 = vreinterpretq_s32_u32(vmovl_u16(d16));
-      int32x4_t d0 = vhaddq_s32(s_s32, d16_s32);
-      d0 = vsubw_s16(d0, offset_s16);
-      d0 = vqrshlq_s32(d0, round_shift_s32);
-      uint16x4_t d = vqmovun_s32(d0);
-      d = vmin_u16(d, vget_low_u16(max));
-      if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
-      } else {
-        vst1_u16(dst_ptr + y * dst_stride, d);
-      }
-    }
-  } else {
-    for (int y = 0; y < h; ++y) {
-      for (int x = 0; x < w; x += 8) {
-        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
-        const uint16x8_t d16 = vld1q_u16(dst16 + y * dst16_stride + x);
-        int32x4_t s_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(s)));
-        int32x4_t s_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(s)));
-        int32x4_t d16_lo = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(d16)));
-        int32x4_t d16_hi = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(d16)));
-        int32x4_t d0 = vhaddq_s32(s_lo, d16_lo);
-        int32x4_t d1 = vhaddq_s32(s_hi, d16_hi);
-        d0 = vsubw_s16(d0, offset_s16);
-        d1 = vsubw_s16(d1, offset_s16);
-        d0 = vqrshlq_s32(d0, round_shift_s32);
-        d1 = vqrshlq_s32(d1, round_shift_s32);
-        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
-        d01 = vminq_u16(d01, max);
-        vst1q_u16(dst_ptr + y * dst_stride + x, d01);
-      }
-    }
-  }
-}
-
-static INLINE void highbd_convolve_correct_offset_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int round_bits, const int offset, const int bd) {
-  const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
-  const int16x4_t offset_s16 = vdup_n_s16(offset);
-  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-  if (w <= 4) {
-    for (int y = 0; y < h; ++y) {
-      const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride);
-      const int32x4_t d0 =
-          vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32);
-      uint16x4_t d = vqmovun_s32(d0);
-      d = vmin_u16(d, vget_low_u16(max));
-      if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
-      } else {
-        vst1_u16(dst_ptr + y * dst_stride, d);
-      }
-    }
-  } else {
-    for (int y = 0; y < h; ++y) {
-      for (int x = 0; x < w; x += 8) {
-        // Subtract round offset and convolve round
-        const int16x8_t s =
-            vld1q_s16((const int16_t *)src_ptr + y * src_stride + x);
-        const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16),
-                                         round_shift_s32);
-        const int32x4_t d1 = vqrshlq_s32(
-            vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32);
-        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
-        d01 = vminq_u16(d01, max);
-        vst1q_u16(dst_ptr + y * dst_stride + x, d01);
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_2d_scale_neon(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
-    ConvolveParams *conv_params, int bd) {
-  uint16_t *im_block = (uint16_t *)aom_memalign(
-      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
-  if (!im_block) return;
-  uint16_t *im_block2 = (uint16_t *)aom_memalign(
-      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
-  if (!im_block2) {
-    aom_free(im_block);  // free the first block and return.
+    highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
+                                          w, h, y_filter_ptr, conv_params, bd,
+                                          y_offset);
     return;
   }
-
-  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-             filter_params_y->taps;
-  const int im_stride = MAX_SB_SIZE;
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  assert(bits >= 0);
-
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
-  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int y_offset_correction =
-      ((1 << (y_offset_bits - conv_params->round_1)) +
-       (1 << (y_offset_bits - conv_params->round_1 - 1)));
-
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  const int dst16_stride = conv_params->dst_stride;
-
-  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
-  highbd_convolve_2d_x_scale_8tap_neon(
-      src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
-      filter_params_x, conv_params, x_offset_bits);
-  if (conv_params->is_compound && !conv_params->do_average) {
-    highbd_convolve_2d_y_scale_8tap_neon(
-        im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn,
-        filter_params_y, conv_params->round_1, y_offset_bits);
+  if (x_filter_taps <= 6 && w != 4) {
+    highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
+                                          im_stride, w, im_h, x_filter_ptr,
+                                          conv_params, x_offset_initial);
   } else {
-    highbd_convolve_2d_y_scale_8tap_neon(
-        im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn,
-        filter_params_y, conv_params->round_1, y_offset_bits);
+    highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+                                     w, im_h, x_filter_ptr, conv_params,
+                                     x_offset_initial);
   }
 
-  // Do the compound averaging outside the loop, avoids branching within the
-  // main loop
-  if (conv_params->is_compound) {
-    if (conv_params->do_average) {
-      if (conv_params->use_dist_wtd_comp_avg) {
-        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
-                                      h, conv_params, bits, y_offset_correction,
-                                      bd);
-      } else {
-        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
-                             conv_params, bits, y_offset_correction, bd);
-      }
-    }
+  if (y_filter_taps <= 6) {
+    highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, y_filter_ptr, conv_params, bd,
+                                         y_offset);
   } else {
-    highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
-                                        w, h, bits, y_offset_correction, bd);
-  }
-  aom_free(im_block);
-  aom_free(im_block2);
-}
-
-static INLINE void highbd_convolve_dist_wtd_x_8tap_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
-    const int offset) {
-  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
-  const int weight_bits = FILTER_BITS - conv_params->round_1;
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
-  const int32x4_t weight_s32 = vdupq_n_s32(1 << weight_bits);
-  const int32x4_t offset_s32 = vdupq_n_s32(offset);
-
-  if (w <= 4) {
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
-
-    const int16_t *s = (const int16_t *)src_ptr;
-    uint16_t *d = dst_ptr;
-
-    do {
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      load_s16_8x2(s + 8, src_stride, &s1, &s3);
-
-      d0 = highbd_convolve8_wtd_horiz4_s32_s16(
-          s0, s1, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
-      d1 = highbd_convolve8_wtd_horiz4_s32_s16(
-          s2, s3, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
-      d01 = vcombine_u16(d0, d1);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
-
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {
-    int height = h;
-    int16x8_t s0, s1, s2, s3;
-    uint16x8_t d0, d1;
-
-    do {
-      int width = w;
-      const int16_t *s = (const int16_t *)src_ptr;
-      uint16_t *d = dst_ptr;
-
-      load_s16_8x2(s, src_stride, &s0, &s2);
-      s += 8;
-
-      do {
-        load_s16_8x2(s, src_stride, &s1, &s3);
-
-        d0 = highbd_convolve8_wtd_horiz8_s32_s16(
-            s0, s1, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
-        d1 = highbd_convolve8_wtd_horiz8_s32_s16(
-            s2, s3, x_filter, shift_s32, zero_s32, weight_s32, offset_s32);
-
-        store_u16_8x2(d, dst_stride, d0, d1);
-
-        s0 = s1;
-        s2 = s3;
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 2 * src_stride;
-      dst_ptr += 2 * dst_stride;
-      height -= 2;
-    } while (height > 0);
+    highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
+                                         w, h, y_filter_ptr, conv_params, bd,
+                                         y_offset);
   }
 }
 
-static INLINE void highbd_convolve_dist_wtd_y_8tap_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
-    const int offset) {
-  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
-  const int weight_bits = FILTER_BITS - conv_params->round_1;
-  const int32x4_t zero_s32 = vdupq_n_s32(0);
-  const int32x4_t weight_s32 = vdupq_n_s32(1 << weight_bits);
-  const int32x4_t offset_s32 = vdupq_n_s32(offset);
-
-  if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x4_t d0, d1;
-    uint16x8_t d01;
-
-    const int16_t *s = (const int16_t *)src_ptr;
-    uint16_t *d = dst_ptr;
-
-    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    s += 7 * src_stride;
-
-    do {
-      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-      d0 = highbd_convolve8_wtd_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                          y_filter, shift_s32, zero_s32,
-                                          weight_s32, offset_s32);
-      d1 = highbd_convolve8_wtd_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                          y_filter, shift_s32, zero_s32,
-                                          weight_s32, offset_s32);
-      d01 = vcombine_u16(d0, d1);
-
-      if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-      } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-      }
-
-      s0 = s2;
-      s1 = s3;
-      s2 = s4;
-      s3 = s5;
-      s4 = s6;
-      s5 = s7;
-      s6 = s8;
-      s += 2 * src_stride;
-      d += 2 * dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x8_t d0, d1;
-
-    do {
-      int height = h;
-      const int16_t *s = (const int16_t *)src_ptr;
-      uint16_t *d = dst_ptr;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      do {
-        load_s16_8x2(s, src_stride, &s7, &s8);
-
-        d0 = highbd_convolve8_wtd_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                            y_filter, shift_s32, zero_s32,
-                                            weight_s32, offset_s32);
-        d1 = highbd_convolve8_wtd_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                            y_filter, shift_s32, zero_s32,
-                                            weight_s32, offset_s32);
-
-        store_u16_8x2(d, dst_stride, d0, d1);
-
-        s0 = s2;
-        s1 = s3;
-        s2 = s4;
-        s3 = s5;
-        s4 = s6;
-        s5 = s7;
-        s6 = s8;
-        s += 2 * src_stride;
-        d += 2 * dst_stride;
-        height -= 2;
-      } while (height > 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
-  }
-}
-
-void av1_highbd_dist_wtd_convolve_x_neon(
+// Filter used is [64, 64].
+void av1_highbd_convolve_x_sr_intrabc_neon(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
     ConvolveParams *conv_params, int bd) {
-  DECLARE_ALIGNED(16, uint16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int im_stride = MAX_SB_SIZE;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  assert(round_bits >= 0);
-
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-
-  src -= horiz_offset;
-
-  // horizontal filter
-  if (conv_params->do_average) {
-    highbd_convolve_dist_wtd_x_8tap_neon(src, src_stride, im_block, im_stride,
-                                         w, h, x_filter_ptr, conv_params,
-                                         round_offset);
-  } else {
-    highbd_convolve_dist_wtd_x_8tap_neon(src, src_stride, dst16, dst16_stride,
-                                         w, h, x_filter_ptr, conv_params,
-                                         round_offset);
-  }
-
-  if (conv_params->do_average) {
-    if (conv_params->use_dist_wtd_comp_avg) {
-      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                                    conv_params, round_bits, round_offset, bd);
-    } else {
-      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                           conv_params, round_bits, round_offset, bd);
-    }
-  }
-}
-
-void av1_highbd_dist_wtd_convolve_y_neon(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
-    ConvolveParams *conv_params, int bd) {
-  DECLARE_ALIGNED(16, uint16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int im_stride = MAX_SB_SIZE;
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  assert(round_bits >= 0);
-
-  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
-  src -= vert_offset * src_stride;
-
-  // vertical filter
-  if (conv_params->do_average) {
-    highbd_convolve_dist_wtd_y_8tap_neon(src, src_stride, im_block, im_stride,
-                                         w, h, y_filter_ptr, conv_params,
-                                         round_offset);
-  } else {
-    highbd_convolve_dist_wtd_y_8tap_neon(src, src_stride, dst16, dst16_stride,
-                                         w, h, y_filter_ptr, conv_params,
-                                         round_offset);
-  }
-
-  if (conv_params->do_average) {
-    if (conv_params->use_dist_wtd_comp_avg) {
-      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                                    conv_params, round_bits, round_offset, bd);
-    } else {
-      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                           conv_params, round_bits, round_offset, bd);
-    }
-  }
-}
-
-static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
-                                       uint16_t *dst_ptr, int dst_stride, int w,
-                                       int h, const int round_bits,
-                                       const int offset) {
-  if (w <= 4) {
-    const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
-    const uint16x4_t offset_u16 = vdup_n_u16(offset);
-
-    for (int y = 0; y < h; ++y) {
-      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
-      uint16x4_t d = vshl_u16(s, round_shift_s16);
-      d = vadd_u16(d, offset_u16);
-      if (w == 2) {
-        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
-      } else {
-        vst1_u16(dst_ptr + y * dst_stride, d);
-      }
-    }
-  } else {
-    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
-    const uint16x8_t offset_u16 = vdupq_n_u16(offset);
-
-    for (int y = 0; y < h; ++y) {
-      for (int x = 0; x < w; x += 8) {
-        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
-        uint16x8_t d = vshlq_u16(s, round_shift_s16);
-        d = vaddq_u16(d, offset_u16);
-        vst1q_u16(dst_ptr + y * dst_stride + x, d);
-      }
-    }
-  }
-}
-
-void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
-                                               int src_stride, uint16_t *dst,
-                                               int dst_stride, int w, int h,
-                                               ConvolveParams *conv_params,
-                                               int bd) {
-  DECLARE_ALIGNED(16, uint16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
-
-  const int im_stride = MAX_SB_SIZE;
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  assert(round_bits >= 0);
-
-  if (conv_params->do_average) {
-    highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
-                        round_offset);
-  } else {
-    highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
-                        round_offset);
-  }
-
-  if (conv_params->do_average) {
-    if (conv_params->use_dist_wtd_comp_avg) {
-      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                                    conv_params, round_bits, round_offset, bd);
-    } else {
-      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
-                           conv_params, round_bits, round_offset, bd);
-    }
-  }
-}
-
-static INLINE void highbd_convolve_y_8tap_neon(
-    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
-    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
-    int offset) {
-  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-  const int32x4_t offset_s32 = vdupq_n_s32(offset);
-  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_1);
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)conv_params;
+  (void)bd;
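+  // Both taps are 64 and sum to 1 << FILTER_BITS, so the filtered output is
+  // simply the rounded average of two horizontally adjacent pixels,
+  //   (src[x] + src[x + 1] + 1) >> 1,
+  // which is what vrhadd_u16 / vrhaddq_u16 compute below.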
 
   if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x4_t d0, d1, d2, d3;
-    uint16x8_t d01, d23;
-
-    const int16_t *s = (const int16_t *)src_ptr;
-    uint16_t *d = dst_ptr;
-
-    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    s += 7 * src_stride;
-
     do {
-      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+      uint16x4_t s0 = vld1_u16(src);
+      uint16x4_t s1 = vld1_u16(src + 1);
 
-      d0 = highbd_convolve8_sr_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                         y_filter, shift_s32, offset_s32);
-      d1 = highbd_convolve8_sr_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                         y_filter, shift_s32, offset_s32);
-      d2 = highbd_convolve8_sr_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
-                                         y_filter, shift_s32, offset_s32);
-      d3 = highbd_convolve8_sr_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
-                                         y_filter, shift_s32, offset_s32);
-
-      d01 = vcombine_u16(d0, d1);
-      d23 = vcombine_u16(d2, d3);
+      uint16x4_t d0 = vrhadd_u16(s0, s1);
 
       if (w == 2) {
-        store_u16q_2x1(d + 0 * dst_stride, d01, 0);
-        store_u16q_2x1(d + 1 * dst_stride, d01, 2);
-        if (h != 2) {
-          store_u16q_2x1(d + 2 * dst_stride, d23, 0);
-          store_u16q_2x1(d + 3 * dst_stride, d23, 2);
-        }
+        store_u16_2x1(dst, d0, 0);
       } else {
-        vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
-        if (h != 2) {
-          vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
-          vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
-        }
+        vst1_u16(dst, d0);
       }
 
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      s += 4 * src_stride;
-      d += 4 * dst_stride;
-      h -= 4;
-    } while (h > 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
   } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint16x8_t d0, d1, d2, d3;
     do {
-      int height = h;
-      const int16_t *s = (const int16_t *)src_ptr;
-      uint16_t *d = dst_ptr;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
+      const uint16_t *src_ptr = src;
+      uint16_t *dst_ptr = dst;
+      int width = w;
 
       do {
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+        uint16x8_t s0 = vld1q_u16(src_ptr);
+        uint16x8_t s1 = vld1q_u16(src_ptr + 1);
 
-        d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7,
-                                        y_filter, offset_s32);
-        d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8,
-                                        y_filter, offset_s32);
-        d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9,
-                                        y_filter, offset_s32);
-        d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10,
-                                        y_filter, offset_s32);
+        uint16x8_t d0 = vrhaddq_u16(s0, s1);
 
-        if (h == 2) {
-          store_u16_8x2(d, dst_stride, d0, d1);
-        } else {
-          store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-        }
+        vst1q_u16(dst_ptr, d0);
 
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height > 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
+        src_ptr += 8;
+        dst_ptr += 8;
+        width -= 8;
+      } while (width != 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
   }
 }
 
-void av1_highbd_dist_wtd_convolve_2d_neon(
+// Filter used is [64, 64].
+void av1_highbd_convolve_y_sr_intrabc_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)bd;
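+  // Both taps are 64 and sum to 1 << FILTER_BITS, so the filtered output is
+  // simply the rounded average of two vertically adjacent pixels,
+  //   (src[x] + src[x + src_stride] + 1) >> 1,
+  // which is what vrhadd_u16 / vrhaddq_u16 compute below.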
+
+  if (w <= 4) {
+    do {
+      uint16x4_t s0 = vld1_u16(src);
+      uint16x4_t s1 = vld1_u16(src + src_stride);
+
+      uint16x4_t d0 = vrhadd_u16(s0, s1);
+
+      if (w == 2) {
+        store_u16_2x1(dst, d0, 0);
+      } else {
+        vst1_u16(dst, d0);
+      }
+
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      const uint16_t *src_ptr = src;
+      uint16_t *dst_ptr = dst;
+      int height = h;
+
+      do {
+        uint16x8_t s0 = vld1q_u16(src_ptr);
+        uint16x8_t s1 = vld1q_u16(src_ptr + src_stride);
+
+        uint16x8_t d0 = vrhaddq_u16(s0, s1);
+
+        vst1q_u16(dst_ptr, d0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+      } while (--height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
+void av1_highbd_convolve_2d_sr_intrabc_neon(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  assert(subpel_x_qn == 8);
+  assert(subpel_y_qn == 8);
+  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)conv_params;
+  (void)bd;
+
   DECLARE_ALIGNED(16, uint16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t,
-                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
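+  // The intermediate block needs h + filter_taps - 1 rows, i.e. h + 1 for the
+  // 2-tap filter.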
+  int im_h = h + 1;
+  int im_stride = MAX_SB_SIZE;
 
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
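+  // The horizontal pass below stores the raw two-pixel sums in im_block. The
+  // vertical pass then combines two of those sums with a pair of halving
+  // adds; adding vert_offset (1) before the second halving add makes the
+  // result (sum_above + sum_below + 2) >> 2, the rounded average of the four
+  // source pixels.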
+  uint16x8_t vert_offset = vdupq_n_u16(1);
 
-  const int im_h = h + filter_params_y->taps - 1;
-  const int im_stride = MAX_SB_SIZE;
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
-  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int y_offset_initial = (1 << y_offset_bits);
-  const int y_offset_correction =
-      ((1 << (y_offset_bits - conv_params->round_1)) +
-       (1 << (y_offset_bits - conv_params->round_1 - 1)));
+  uint16_t *im = im_block;
 
-  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
-  // horizontal filter
-  highbd_convolve_x_8tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
-                              x_filter_ptr, conv_params, x_offset_initial);
-  // vertical filter
-  if (conv_params->do_average) {
-    highbd_convolve_y_8tap_neon(im_block, im_stride, im_block2, im_stride, w, h,
-                                y_filter_ptr, conv_params, y_offset_initial);
-  } else {
-    highbd_convolve_y_8tap_neon(im_block, im_stride, dst16, dst16_stride, w, h,
-                                y_filter_ptr, conv_params, y_offset_initial);
-  }
-
-  // Do the compound averaging outside the loop, avoids branching within the
-  // main loop
-  if (conv_params->do_average) {
-    if (conv_params->use_dist_wtd_comp_avg) {
-      highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
-                                    conv_params, round_bits,
-                                    y_offset_correction, bd);
-    } else {
-      highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
-                           conv_params, round_bits, y_offset_correction, bd);
-    }
-  }
-}
-
-#define UPSCALE_NORMATIVE_TAPS 8
-
-void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
-                                       uint16_t *dst, int dst_stride, int w,
-                                       int h, const int16_t *x_filters,
-                                       int x0_qn, int x_step_qn, int bd) {
-  const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;
-
-  const int32x4_t idx = { 0, 1, 2, 3 };
-  const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
-  const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
-  const int32x4_t offset_s32 = vdupq_n_s32(0);
-  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
-
-  const uint16_t *src_ptr = src - horiz_offset;
-  uint16_t *dst_ptr = dst;
-
+  // Horizontal filter.
   if (w <= 4) {
-    int height = h;
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0;
-
-    uint16_t *d = dst_ptr;
     do {
-      int x_qn = x0_qn;
+      uint16x4_t s0 = vld1_u16(src);
+      uint16x4_t s1 = vld1_u16(src + 1);
 
-      // Load 4 src vectors at a time, they might be the same, but we have to
-      // calculate the indices anyway. Doing it in SIMD and then storing the
-      // indices is faster than having to calculate the expression
-      // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
-      // Ideally this should be a gather using the indices, but NEON does not
-      // have that, so have to emulate
-      const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
-      // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
-      // 2
-      const int32x4_t src_idx =
-          vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
-      // Similarly for the filter vector indices, we calculate the filter
-      // indices for 4 columns. First we calculate the indices:
-      // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
-      // Then we calculate the actual pointers, multiplying with
-      // UPSCALE_UPSCALE_NORMATIVE_TAPS
-      // again shift left by 1
-      const int32x4_t x_filter4_idx = vshlq_n_s32(
-          vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
-      // Even though pointers are unsigned 32/64-bit ints we do signed
-      // addition The reason for this is that x_qn can be negative, leading to
-      // negative offsets. Argon test
-      // profile0_core/streams/test10573_11003.obu was failing because of
-      // this.
-#if AOM_ARCH_AARCH64
-      uint64x2_t tmp4[2];
-      tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
-          vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
-      tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
-          vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
-      int16_t *src4_ptr[4];
-      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
-      vst1q_u64(tmp_ptr, tmp4[0]);
-      vst1q_u64(tmp_ptr + 2, tmp4[1]);
+      uint16x4_t d0 = vadd_u16(s0, s1);
 
-      // filter vectors
-      tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
-          vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
-          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
-      tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
-          vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
-          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+      // Safe to store the whole vector; the im buffer is big enough.
+      vst1_u16(im, d0);
 
-      const int16_t *x_filter4_ptr[4];
-      tmp_ptr = (uint64_t *)&x_filter4_ptr;
-      vst1q_u64(tmp_ptr, tmp4[0]);
-      vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
-      uint32x4_t tmp4;
-      tmp4 = vreinterpretq_u32_s32(
-          vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
-      int16_t *src4_ptr[4];
-      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
-      vst1q_u32(tmp_ptr, tmp4);
-      // filter vectors
-      tmp4 = vreinterpretq_u32_s32(
-          vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
-                    vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
-
-      const int16_t *x_filter4_ptr[4];
-      tmp_ptr = (uint32_t *)&x_filter4_ptr;
-      vst1q_u32(tmp_ptr, tmp4);
-#endif  // AOM_ARCH_AARCH64
-      // Load source
-      s0 = vld1q_s16(src4_ptr[0]);
-      s1 = vld1q_s16(src4_ptr[1]);
-      s2 = vld1q_s16(src4_ptr[2]);
-      s3 = vld1q_s16(src4_ptr[3]);
-
-      // Actually load the filters
-      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
-      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
-      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
-      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
-
-      // Group low and high parts and transpose
-      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
-                                 vget_low_s16(x_filter1),
-                                 vget_low_s16(x_filter2),
-                                 vget_low_s16(x_filter3) };
-      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
-                                 vget_high_s16(x_filter1),
-                                 vget_high_s16(x_filter2),
-                                 vget_high_s16(x_filter3) };
-      transpose_u16_4x4((uint16x4_t *)filters_lo);
-      transpose_u16_4x4((uint16x4_t *)filters_hi);
-
-      // Run the 2D Scale convolution
-      d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
-          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
-
-      d0 = vmin_u16(d0, max);
-
-      if (w == 2) {
-        store_u16_2x1(d + 0 * dst_stride, d0, 0);
-      } else {
-        vst1_u16(d + 0 * dst_stride, d0);
-      }
-
-      src_ptr += src_stride;
-      d += dst_stride;
-      height--;
-    } while (height > 0);
+      src += src_stride;
+      im += im_stride;
+    } while (--im_h != 0);
   } else {
-    int height = h;
-    int16x8_t s0, s1, s2, s3;
-    uint16x4_t d0;
-
     do {
+      const uint16_t *src_ptr = src;
+      uint16_t *im_ptr = im;
       int width = w;
-      int x_qn = x0_qn;
-      uint16_t *d = dst_ptr;
-      const uint16_t *s = src_ptr;
 
       do {
-        // Load 4 src vectors at a time, they might be the same, but we have to
-        // calculate the indices anyway. Doing it in SIMD and then storing the
-        // indices is faster than having to calculate the expression
-        // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
-        // Ideally this should be a gather using the indices, but NEON does not
-        // have that, so have to emulate
-        const int32x4_t xqn_idx =
-            vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
-        // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
-        // = 2
-        const int32x4_t src_idx =
-            vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+        uint16x8_t s0 = vld1q_u16(src_ptr);
+        uint16x8_t s1 = vld1q_u16(src_ptr + 1);
 
-        // Similarly for the filter vector indices, we calculate the filter
-        // indices for 4 columns. First we calculate the indices:
-        // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
-        // Then we calculate the actual pointers, multiplying with
-        // UPSCALE_UPSCALE_NORMATIVE_TAPS
-        // again shift left by 1
-        const int32x4_t x_filter4_idx = vshlq_n_s32(
-            vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
-            1);
-        // Even though pointers are unsigned 32/64-bit ints we do signed
-        // addition The reason for this is that x_qn can be negative, leading to
-        // negative offsets. Argon test
-        // profile0_core/streams/test10573_11003.obu was failing because of
-        // this.
-#if AOM_ARCH_AARCH64
-        uint64x2_t tmp4[2];
-        tmp4[0] = vreinterpretq_u64_s64(
-            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
-        tmp4[1] = vreinterpretq_u64_s64(
-            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
-        int16_t *src4_ptr[4];
-        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
-        vst1q_u64(tmp_ptr, tmp4[0]);
-        vst1q_u64(tmp_ptr + 2, tmp4[1]);
+        uint16x8_t d0 = vaddq_u16(s0, s1);
 
-        // filter vectors
-        tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
-            vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
-            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
-        tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
-            vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
-            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+        vst1q_u16(im_ptr, d0);
 
-        const int16_t *x_filter4_ptr[4];
-        tmp_ptr = (uint64_t *)&x_filter4_ptr;
-        vst1q_u64(tmp_ptr, tmp4[0]);
-        vst1q_u64(tmp_ptr + 2, tmp4[1]);
-#else
-        uint32x4_t tmp4;
-        tmp4 = vreinterpretq_u32_s32(
-            vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
-        int16_t *src4_ptr[4];
-        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
-        vst1q_u32(tmp_ptr, tmp4);
-        // filter vectors
-        tmp4 = vreinterpretq_u32_s32(
-            vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
-                      vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+        src_ptr += 8;
+        im_ptr += 8;
+        width -= 8;
+      } while (width != 0);
+      src += src_stride;
+      im += im_stride;
+    } while (--im_h != 0);
+  }
 
-        const int16_t *x_filter4_ptr[4];
-        tmp_ptr = (uint32_t *)&x_filter4_ptr;
-        vst1q_u32(tmp_ptr, tmp4);
-#endif  // AOM_ARCH_AARCH64
+  im = im_block;
 
-        // Load source
-        s0 = vld1q_s16(src4_ptr[0]);
-        s1 = vld1q_s16(src4_ptr[1]);
-        s2 = vld1q_s16(src4_ptr[2]);
-        s3 = vld1q_s16(src4_ptr[3]);
+  // Vertical filter.
+  if (w <= 4) {
+    do {
+      uint16x4_t s0 = vld1_u16(im);
+      uint16x4_t s1 = vld1_u16(im + im_stride);
 
-        // Actually load the filters
-        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
-        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
-        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
-        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+      uint16x4_t d0 = vhadd_u16(s0, s1);
+      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));
 
-        // Group low and high parts and transpose
-        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
-                                   vget_low_s16(x_filter1),
-                                   vget_low_s16(x_filter2),
-                                   vget_low_s16(x_filter3) };
-        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
-                                   vget_high_s16(x_filter1),
-                                   vget_high_s16(x_filter2),
-                                   vget_high_s16(x_filter3) };
-        transpose_u16_4x4((uint16x4_t *)filters_lo);
-        transpose_u16_4x4((uint16x4_t *)filters_hi);
+      if (w == 2) {
+        store_u16_2x1(dst, d0, 0);
+      } else {
+        vst1_u16(dst, d0);
+      }
 
-        // Run the 2D Scale X convolution
-        d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
-            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+      im += im_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      uint16_t *im_ptr = im;
+      uint16_t *dst_ptr = dst;
+      int height = h;
 
-        d0 = vmin_u16(d0, max);
-        vst1_u16(d, d0);
+      do {
+        uint16x8_t s0 = vld1q_u16(im_ptr);
+        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
 
-        x_qn += 4 * x_step_qn;
-        d += 4;
-        width -= 4;
-      } while (width > 0);
+        uint16x8_t d0 = vhaddq_u16(s0, s1);
+        d0 = vhaddq_u16(d0, vert_offset);
 
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-    } while (height > 0);
+        vst1q_u16(dst_ptr, d0);
+
+        im_ptr += im_stride;
+        dst_ptr += dst_stride;
+      } while (--height != 0);
+      im += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
diff --git a/av1/common/arm/highbd_convolve_neon.h b/av1/common/arm/highbd_convolve_neon.h
index f9d028f..08b2bda 100644
--- a/av1/common/arm/highbd_convolve_neon.h
+++ b/av1/common/arm/highbd_convolve_neon.h
@@ -14,68 +14,9 @@
 
 #include <arm_neon.h>
 
-static INLINE int32x4_t highbd_convolve6_4_s32(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x8_t y_filter, const int32x4_t offset) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
-  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 1);
-  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
-  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
-  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
-  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
-  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
-
-  return sum;
-}
-
-static INLINE uint16x4_t highbd_convolve6_4_s32_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x8_t y_filter, const int32x4_t offset) {
-  int32x4_t sum =
-      highbd_convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset);
-
-  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE void highbd_convolve6_8_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t y_filter, const int32x4_t offset, int32x4_t *sum0,
-    int32x4_t *sum1) {
-  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
-  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
-  *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 1);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 2);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 3);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_hi, 0);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 1);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 2);
-
-  *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 1);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 2);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 3);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_hi, 0);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 1);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 2);
-}
-
-static INLINE uint16x8_t highbd_convolve6_8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t y_filter, const int32x4_t offset) {
-  int32x4_t sum0;
-  int32x4_t sum1;
-  highbd_convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset, &sum0,
-                         &sum1);
-
-  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
-                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/convolve.h"
 
 static INLINE int32x4_t highbd_convolve8_4_s32(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
@@ -97,18 +38,7 @@
   return sum;
 }
 
-static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
-    const int32x4_t offset) {
-  int32x4_t sum =
-      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
-
-  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x4_t highbd_convolve8_sr_4_s32_s16(
+static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -120,23 +50,8 @@
   return vqmovun_s32(sum);
 }
 
-static INLINE uint16x4_t highbd_convolve8_wtd_4_s32_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
-    const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
-    const int32x4_t offset2) {
-  int32x4_t sum =
-      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
-
-  sum = vqrshlq_s32(sum, shift_s32);
-  sum = vmlaq_s32(offset2, sum, weight);
-
-  return vqmovun_s32(sum);
-}
-
 // Like above but also perform round shifting and subtract correction term
-static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16(
+static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
@@ -176,41 +91,8 @@
   *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
 }
 
-static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
-    const int32x4_t offset) {
-  int32x4_t sum0;
-  int32x4_t sum1;
-  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset,
-                         &sum0, &sum1);
-
-  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
-                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-static INLINE uint16x8_t highbd_convolve8_wtd_8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
-    const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
-    const int32x4_t offset2) {
-  int32x4_t sum0;
-  int32x4_t sum1;
-  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset,
-                         &sum0, &sum1);
-
-  sum0 = vqrshlq_s32(sum0, shift_s32);
-  sum1 = vqrshlq_s32(sum1, shift_s32);
-  sum0 = vmlaq_s32(offset2, sum0, weight);
-  sum1 = vmlaq_s32(offset2, sum1, weight);
-
-  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
 // Like above but also perform round shifting and subtract correction term
-static INLINE uint16x8_t highbd_convolve8_8_sr_s32_s16(
+static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
@@ -227,290 +109,6 @@
   return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
 }
 
-static INLINE int32x4_t highbd_convolve12_y_4_s32(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t offset) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-
-  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
-  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
-  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
-  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
-  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
-  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
-  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
-  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
-  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
-  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
-  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
-  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
-
-  return sum;
-}
-
-static INLINE uint16x4_t highbd_convolve12_y_4_s32_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t offset) {
-  int32x4_t sum =
-      highbd_convolve12_y_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                s11, y_filter_0_7, y_filter_8_11, offset);
-
-  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x4_t highbd_convolve12_y_4_sr_s32_s16(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
-    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t round_shift, const int32x4_t offset,
-    const int32x4_t correction) {
-  int32x4_t sum =
-      highbd_convolve12_y_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
-                                s11, y_filter_0_7, y_filter_8_11, offset);
-
-  sum = vsubq_s32(vqrshlq_s32(sum, round_shift), correction);
-  return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve12_y_8_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
-    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
-
-  *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_0_3, 1);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_0_3, 2);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_0_3, 3);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_4_7, 0);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_4_7, 1);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_4_7, 2);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_4_7, 3);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s8), y_filter_8_11, 0);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s9), y_filter_8_11, 1);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s10), y_filter_8_11, 2);
-  *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s11), y_filter_8_11, 3);
-
-  *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_0_3, 1);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_0_3, 2);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_0_3, 3);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_4_7, 0);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_4_7, 1);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_4_7, 2);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_4_7, 3);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s8), y_filter_8_11, 0);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s9), y_filter_8_11, 1);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s10), y_filter_8_11, 2);
-  *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s11), y_filter_8_11, 3);
-}
-
-static INLINE uint16x8_t highbd_convolve12_y_8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
-    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t offset) {
-  int32x4_t sum0;
-  int32x4_t sum1;
-  highbd_convolve12_y_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                            y_filter_0_7, y_filter_8_11, offset, &sum0, &sum1);
-
-  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
-                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-// Like above but also perform round shifting and subtract correction term
-static INLINE uint16x8_t highbd_convolve12_y_8_sr_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
-    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
-    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
-    const int32x4_t round_shift, const int32x4_t offset,
-    const int32x4_t correction) {
-  int32x4_t sum0;
-  int32x4_t sum1;
-  highbd_convolve12_y_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                            y_filter_0_7, y_filter_8_11, offset, &sum0, &sum1);
-
-  sum0 = vsubq_s32(vqrshlq_s32(sum0, round_shift), correction);
-  sum1 = vsubq_s32(vqrshlq_s32(sum1, round_shift), correction);
-
-  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE int32x4_t highbd_convolve8_horiz4_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int32x4_t offset) {
-  const int16x8_t s2 = vextq_s16(s0, s1, 1);
-  const int16x8_t s3 = vextq_s16(s0, s1, 2);
-  const int16x8_t s4 = vextq_s16(s0, s1, 3);
-  const int16x4_t s0_lo = vget_low_s16(s0);
-  const int16x4_t s1_lo = vget_low_s16(s2);
-  const int16x4_t s2_lo = vget_low_s16(s3);
-  const int16x4_t s3_lo = vget_low_s16(s4);
-  const int16x4_t s4_lo = vget_high_s16(s0);
-  const int16x4_t s5_lo = vget_high_s16(s2);
-  const int16x4_t s6_lo = vget_high_s16(s3);
-  const int16x4_t s7_lo = vget_high_s16(s4);
-
-  return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
-                                s7_lo, x_filter_0_7, offset);
-}
-
-static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int32x4_t shift_s32, const int32x4_t offset) {
-  int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7, offset);
-
-  sum = vqrshlq_s32(sum, shift_s32);
-  return vqmovun_s32(sum);
-}
-
-static INLINE uint16x4_t highbd_convolve8_wtd_horiz4_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
-    const int32x4_t offset2) {
-  int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7, offset);
-
-  sum = vqrshlq_s32(sum, shift_s32);
-  sum = vmlaq_s32(offset2, sum, weight);
-  return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve8_horiz8_s32(
-    const int16x8_t s0, const int16x8_t s0_hi, const int16x8_t x_filter_0_7,
-    const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
-  const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
-  const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
-  const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
-  const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
-  const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
-  const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
-  const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
-
-  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, offset,
-                         sum0, sum1);
-}
-
-static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int32x4_t shift_s32, const int32x4_t offset) {
-  int32x4_t sum0, sum1;
-  highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, offset, &sum0, &sum1);
-
-  sum0 = vqrshlq_s32(sum0, shift_s32);
-  sum1 = vqrshlq_s32(sum1, shift_s32);
-
-  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE uint16x8_t highbd_convolve8_wtd_horiz8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int32x4_t shift_s32, const int32x4_t offset, const int32x4_t weight,
-    const int32x4_t offset2) {
-  int32x4_t sum0, sum1;
-  highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, offset, &sum0, &sum1);
-
-  sum0 = vqrshlq_s32(sum0, shift_s32);
-  sum1 = vqrshlq_s32(sum1, shift_s32);
-  sum0 = vmlaq_s32(offset2, sum0, weight);
-  sum1 = vmlaq_s32(offset2, sum1, weight);
-
-  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
-static INLINE int32x4_t highbd_convolve12_horiz4_s32(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11, const int32x4_t offset) {
-  const int16x8_t s2 = vextq_s16(s0, s1, 1);
-  const int16x8_t s3 = vextq_s16(s0, s1, 2);
-  const int16x8_t s4 = vextq_s16(s0, s1, 3);
-  const int16x8_t s5 = vextq_s16(s0, s1, 4);
-  const int16x8_t s6 = vextq_s16(s0, s1, 5);
-  const int16x8_t s7 = vextq_s16(s0, s1, 6);
-  const int16x8_t s8 = vextq_s16(s0, s1, 7);
-  const int16x4_t s0_lo = vget_low_s16(s0);
-  const int16x4_t s1_lo = vget_low_s16(s2);
-  const int16x4_t s2_lo = vget_low_s16(s3);
-  const int16x4_t s3_lo = vget_low_s16(s4);
-  const int16x4_t s4_lo = vget_high_s16(s0);
-  const int16x4_t s5_lo = vget_high_s16(s2);
-  const int16x4_t s6_lo = vget_high_s16(s3);
-  const int16x4_t s7_lo = vget_high_s16(s4);
-  const int16x4_t s8_lo = vget_high_s16(s5);
-  const int16x4_t s9_lo = vget_high_s16(s6);
-  const int16x4_t s10_lo = vget_high_s16(s7);
-  const int16x4_t s11_lo = vget_high_s16(s8);
-
-  return highbd_convolve12_y_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo,
-                                   s6_lo, s7_lo, s8_lo, s9_lo, s10_lo, s11_lo,
-                                   x_filter_0_7, x_filter_8_11, offset);
-}
-
-static INLINE uint16x4_t highbd_convolve12_horiz4_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7,
-    const int16x4_t x_filter_8_11, const int32x4_t shift_s32,
-    const int32x4_t offset) {
-  int32x4_t sum =
-      highbd_convolve12_horiz4_s32(s0, s1, x_filter_0_7, x_filter_8_11, offset);
-
-  sum = vqrshlq_s32(sum, shift_s32);
-  return vqmovun_s32(sum);
-}
-
-static INLINE void highbd_convolve12_horiz8_s32(
-    const int16x8_t s0_0, const int16x8_t s0_1, const int16x8_t s0_2,
-    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
-    const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
-  const int16x8_t s1 = vextq_s16(s0_0, s0_1, 1);
-  const int16x8_t s2 = vextq_s16(s0_0, s0_1, 2);
-  const int16x8_t s3 = vextq_s16(s0_0, s0_1, 3);
-  const int16x8_t s4 = vextq_s16(s0_0, s0_1, 4);
-  const int16x8_t s5 = vextq_s16(s0_0, s0_1, 5);
-  const int16x8_t s6 = vextq_s16(s0_0, s0_1, 6);
-  const int16x8_t s7 = vextq_s16(s0_0, s0_1, 7);
-  const int16x8_t s8 = s0_1;
-  const int16x8_t s9 = vextq_s16(s0_1, s0_2, 1);
-  const int16x8_t s10 = vextq_s16(s0_1, s0_2, 2);
-  const int16x8_t s11 = vextq_s16(s0_1, s0_2, 3);
-
-  highbd_convolve12_y_8_s32(s0_0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
-                            x_filter_0_7, x_filter_8_11, offset, sum0, sum1);
-}
-
-static INLINE uint16x8_t highbd_convolve12_horiz8_s32_s16(
-    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
-    const int32x4_t shift_s32, const int32x4_t offset) {
-  int32x4_t sum0, sum1;
-  highbd_convolve12_horiz8_s32(s0, s1, s2, x_filter_0_7, x_filter_8_11, offset,
-                               &sum0, &sum1);
-
-  sum0 = vqrshlq_s32(sum0, shift_s32);
-  sum1 = vqrshlq_s32(sum1, shift_s32);
-
-  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
-}
-
 static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x4_t *filters_lo,
@@ -520,8 +118,8 @@
   int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2),
                        vget_high_s16(s3) };
 
-  transpose_u16_4x4((uint16x4_t *)s_lo);
-  transpose_u16_4x4((uint16x4_t *)s_hi);
+  transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo);
+  transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi);
 
   int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]);
   sum = vmlal_s16(sum, s_lo[1], filters_lo[1]);
diff --git a/av1/common/arm/highbd_convolve_scale_neon.c b/av1/common/arm/highbd_convolve_scale_neon.c
new file mode 100644
index 0000000..eee5a1c
--- /dev/null
+++ b/av1/common/arm/highbd_convolve_scale_neon.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int round_bits,
+    const int offset, const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+  const uint32x4_t offset_vec = vdupq_n_u32(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+  // Weighted averaging
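+  // Each output sample is computed as
+  //   (fwd_offset * ref + bck_offset * src) >> DIST_PRECISION_BITS,
+  // then the compound offset is removed and the result is rounded back to
+  // pixel range and clamped to (1 << bd) - 1.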
+  if (w <= 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+      d0 = vqrshlq_s32(d0, round_shift);
+
+      uint16x4_t d0_u16 = vqmovun_s32(d0);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      if (w == 2) {
+        store_u16_2x1(dst_ptr, d0_u16, 0);
+      } else {
+        vst1_u16(dst_ptr, d0_u16);
+      }
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+        d0 = vqrshlq_s32(d0, round_shift);
+
+        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+        d1 = vqrshlq_s32(d1, round_shift);
+
+        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst, d01);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      ref_ptr += ref_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+                                        uint16_t *dst_ptr, int dst_stride,
+                                        int w, int h,
+                                        ConvolveParams *conv_params,
+                                        const int round_bits, const int offset,
+                                        const int bd) {
+  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+  const uint16x4_t offset_vec = vdup_n_u16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
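+  // Non-weighted compound average: halve the sum of the two predictions,
+  // remove the compound offset, round back to pixel range and clamp to
+  // (1 << bd) - 1.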
+  if (w <= 4) {
+    do {
+      const uint16x4_t src = vld1_u16(src_ptr);
+      const uint16x4_t ref = vld1_u16(ref_ptr);
+
+      uint16x4_t avg = vhadd_u16(src, ref);
+      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+      d0 = vqrshlq_s32(d0, round_shift);
+
+      uint16x4_t d0_u16 = vqmovun_s32(d0);
+      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+      if (w == 2) {
+        store_u16_2x1(dst_ptr, d0_u16, 0);
+      } else {
+        vst1_u16(dst_ptr, d0_u16);
+      }
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *src = src_ptr;
+      const uint16_t *ref = ref_ptr;
+      uint16_t *dst = dst_ptr;
+      do {
+        const uint16x8_t s = vld1q_u16(src);
+        const uint16x8_t r = vld1q_u16(ref);
+
+        uint16x8_t avg = vhaddq_u16(s, r);
+        int32x4_t d0_lo =
+            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+        int32x4_t d0_hi =
+            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+        d0_lo = vqrshlq_s32(d0_lo, round_shift);
+        d0_hi = vqrshlq_s32(d0_hi, round_shift);
+
+        uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi));
+        d0 = vminq_u16(d0, max);
+        vst1q_u16(dst, d0);
+
+        src += 8;
+        ref += 8;
+        dst += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_2d_x_scale_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int subpel_x_qn, const int x_step_qn,
+    const InterpFilterParams *filter_params, ConvolveParams *conv_params,
+    const int offset) {
+  static const uint32_t kIdx[4] = { 0, 1, 2, 3 };
+  const uint32x4_t idx = vld1q_u32(kIdx);
+  const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
+  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+  const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+  if (w <= 4) {
+    int height = h;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int x_qn = subpel_x_qn;
+
+      // Load 4 src vectors at a time; they might be the same, but we have to
+      // calculate the indices anyway. Doing it in SIMD and then storing the
+      // indices is faster than having to calculate the expression
+      // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times.
+      // Ideally this would be a gather using the indices, but NEON does not
+      // have that, so we have to emulate it.
+      const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
+      // We have to multiply the index by 2 to get the byte offset, as
+      // sizeof(uint16_t) == 2.
+      const uint32x4_t src_idx_u32 =
+          vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
+#if AOM_ARCH_AARCH64
+      uint64x2_t src4[2];
+      src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
+                          vget_low_u32(src_idx_u32));
+      src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
+                          vget_high_u32(src_idx_u32));
+      int16_t *src4_ptr[4];
+      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+      vst1q_u64(tmp_ptr, src4[0]);
+      vst1q_u64(tmp_ptr + 2, src4[1]);
+#else
+      uint32x4_t src4;
+      src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32);
+      int16_t *src4_ptr[4];
+      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+      vst1q_u32(tmp_ptr, src4);
+#endif  // AOM_ARCH_AARCH64
+      // Same for the filter vectors
+      const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
+          vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
+      int32_t x_filter4_idx[4];
+      vst1q_s32(x_filter4_idx, filter_idx_s32);
+      const int16_t *x_filter4_ptr[4];
+
+      // Load source
+      int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+      int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+      int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+      int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+      // We could easily do this using SIMD as well instead of calling the
+      // inline function 4 times.
+      x_filter4_ptr[0] =
+          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
+      x_filter4_ptr[1] =
+          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
+      x_filter4_ptr[2] =
+          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
+      x_filter4_ptr[3] =
+          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
+
+      // Actually load the filters
+      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+      // Group low and high parts and transpose
+      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+                                 vget_low_s16(x_filter1),
+                                 vget_low_s16(x_filter2),
+                                 vget_low_s16(x_filter3) };
+      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+                                 vget_high_s16(x_filter1),
+                                 vget_high_s16(x_filter2),
+                                 vget_high_s16(x_filter3) };
+      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+      // Run the 2D Scale convolution
+      uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+      if (w == 2) {
+        store_u16_2x1(d + 0 * dst_stride, d0, 0);
+      } else {
+        vst1_u16(d + 0 * dst_stride, d0);
+      }
+
+      src_ptr += src_stride;
+      d += dst_stride;
+      height--;
+    } while (height > 0);
+  } else {
+    int height = h;
+
+    do {
+      int width = w;
+      int x_qn = subpel_x_qn;
+      uint16_t *d = dst_ptr;
+      const uint16_t *s = src_ptr;
+
+      do {
+        // Load 4 src vectors at a time; they might be the same, but we have to
+        // calculate the indices anyway. Doing it in SIMD and then storing the
+        // indices is faster than having to calculate the expression
+        // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times.
+        // Ideally this would be a gather using the indices, but NEON does not
+        // have that, so we have to emulate it.
+        const uint32x4_t xqn_idx =
+            vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
+        // We have to multiply the index by 2 to get the byte offset, as
+        // sizeof(uint16_t) == 2.
+        const uint32x4_t src_idx_u32 =
+            vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
+#if AOM_ARCH_AARCH64
+        uint64x2_t src4[2];
+        src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
+                            vget_low_u32(src_idx_u32));
+        src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
+                            vget_high_u32(src_idx_u32));
+        int16_t *src4_ptr[4];
+        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+        vst1q_u64(tmp_ptr, src4[0]);
+        vst1q_u64(tmp_ptr + 2, src4[1]);
+#else
+        uint32x4_t src4;
+        src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
+        int16_t *src4_ptr[4];
+        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+        vst1q_u32(tmp_ptr, src4);
+#endif  // AOM_ARCH_AARCH64
+        // Same for the filter vectors
+        const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
+            vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
+        int32_t x_filter4_idx[4];
+        vst1q_s32(x_filter4_idx, filter_idx_s32);
+        const int16_t *x_filter4_ptr[4];
+
+        // Load source
+        int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+        int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+        int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+        int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+        // We could easily do this using SIMD as well instead of calling the
+        // inline function 4 times.
+        x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(
+            filter_params, x_filter4_idx[0]);
+        x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(
+            filter_params, x_filter4_idx[1]);
+        x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(
+            filter_params, x_filter4_idx[2]);
+        x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(
+            filter_params, x_filter4_idx[3]);
+
+        // Actually load the filters
+        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+        // Group low and high parts and transpose
+        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+                                   vget_low_s16(x_filter1),
+                                   vget_low_s16(x_filter2),
+                                   vget_low_s16(x_filter3) };
+        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+                                   vget_high_s16(x_filter1),
+                                   vget_high_s16(x_filter2),
+                                   vget_high_s16(x_filter3) };
+        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+        // Run the 2D Scale X convolution
+        uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+        vst1_u16(d, d0);
+
+        x_qn += 4 * x_step_qn;
+        d += 4;
+        width -= 4;
+      } while (width > 0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      height--;
+    } while (height > 0);
+  }
+}
+
+static INLINE void highbd_convolve_2d_y_scale_8tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int subpel_y_qn, const int y_step_qn,
+    const InterpFilterParams *filter_params, const int round1_bits,
+    const int offset) {
+  const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
+
+  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
+  if (w <= 4) {
+    int height = h;
+    uint16_t *d = dst_ptr;
+    int y_qn = subpel_y_qn;
+
+    do {
+      const int16_t *s =
+          (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+      const int16_t *y_filter_ptr =
+          av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
+      const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+      uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16(
+          s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
+          offset_s32, vdupq_n_s32(0));
+
+      if (w == 2) {
+        store_u16_2x1(d, d0, 0);
+      } else {
+        vst1_u16(d, d0);
+      }
+
+      y_qn += y_step_qn;
+      d += dst_stride;
+      height--;
+    } while (height > 0);
+  } else {
+    int width = w;
+
+    do {
+      int height = h;
+      int y_qn = subpel_y_qn;
+
+      uint16_t *d = dst_ptr;
+
+      do {
+        const int16_t *s =
+            (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
+        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+        const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+        const int16_t *y_filter_ptr =
+            av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
+        const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+        uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16(
+            s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
+            offset_s32, vdupq_n_s32(0));
+        vst1q_u16(d, d0);
+
+        y_qn += y_step_qn;
+        d += dst_stride;
+        height--;
+      } while (height > 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+static INLINE void highbd_convolve_correct_offset_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int round_bits, const int offset, const int bd) {
+  const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
+  const int16x4_t offset_s16 = vdup_n_s16(offset);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
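+  // Remove the intermediate-buffer offset, apply the final rounding shift and
+  // clamp each result to the valid range for this bit depth.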
+  if (w <= 4) {
+    for (int y = 0; y < h; ++y) {
+      const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride);
+      const int32x4_t d0 =
+          vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32);
+      uint16x4_t d = vqmovun_s32(d0);
+      d = vmin_u16(d, vget_low_u16(max));
+      if (w == 2) {
+        store_u16_2x1(dst_ptr + y * dst_stride, d, 0);
+      } else {
+        vst1_u16(dst_ptr + y * dst_stride, d);
+      }
+    }
+  } else {
+    for (int y = 0; y < h; ++y) {
+      for (int x = 0; x < w; x += 8) {
+        // Subtract round offset and convolve round
+        const int16x8_t s =
+            vld1q_s16((const int16_t *)src_ptr + y * src_stride + x);
+        const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16),
+                                         round_shift_s32);
+        const int32x4_t d1 = vqrshlq_s32(
+            vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32);
+        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
+        d01 = vminq_u16(d01, max);
+        vst1q_u16(dst_ptr + y * dst_stride + x, d01);
+      }
+    }
+  }
+}
+
+void av1_highbd_convolve_2d_scale_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params, int bd) {
+  uint16_t *im_block = (uint16_t *)aom_memalign(
+      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+  if (!im_block) return;
+  uint16_t *im_block2 = (uint16_t *)aom_memalign(
+      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+  if (!im_block2) {
+    aom_free(im_block);  // free the first block and return.
+    return;
+  }
+
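+  // The intermediate buffer must cover every source row touched by the
+  // vertical filter over the scaled block height.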
+  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+             filter_params_y->taps;
+  const int im_stride = MAX_SB_SIZE;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
+  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int y_offset_correction =
+      ((1 << (y_offset_bits - conv_params->round_1)) +
+       (1 << (y_offset_bits - conv_params->round_1 - 1)));
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+
+  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+  highbd_convolve_2d_x_scale_8tap_neon(
+      src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
+      filter_params_x, conv_params, x_offset_bits);
+  if (conv_params->is_compound && !conv_params->do_average) {
+    highbd_convolve_2d_y_scale_8tap_neon(
+        im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn,
+        filter_params_y, conv_params->round_1, y_offset_bits);
+  } else {
+    highbd_convolve_2d_y_scale_8tap_neon(
+        im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn,
+        filter_params_y, conv_params->round_1, y_offset_bits);
+  }
+
+  // Do the compound averaging outside the loop; this avoids branching within
+  // the main loop.
+  if (conv_params->is_compound) {
+    if (conv_params->do_average) {
+      if (conv_params->use_dist_wtd_comp_avg) {
+        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+                                      h, conv_params, bits, y_offset_correction,
+                                      bd);
+      } else {
+        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                             conv_params, bits, y_offset_correction, bd);
+      }
+    }
+  } else {
+    highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
+                                        w, h, bits, y_offset_correction, bd);
+  }
+  aom_free(im_block);
+  aom_free(im_block2);
+}
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
index d197fca..84bc8fd 100644
--- a/av1/common/arm/highbd_inv_txfm_neon.c
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -590,7 +590,7 @@
                           int bd, int out_shift) {
   const int32_t *sinpi = sinpi_arr(bit);
   const int32x4_t zero = vdupq_n_s32(0);
-  int64x2_t rnding = vdupq_n_s64(1 << (bit + 4 - 1));
+  int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
   const int32x2_t mul = vdup_n_s32(1 << 4);
   int32x4_t t;
   int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
diff --git a/av1/common/arm/highbd_reconinter_neon.c b/av1/common/arm/highbd_reconinter_neon.c
new file mode 100644
index 0000000..573d3c1
--- /dev/null
+++ b/av1/common/arm/highbd_reconinter_neon.c
@@ -0,0 +1,330 @@
+/*
+ *
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse,
+                                            const uint16_t *src0,
+                                            int src0_stride,
+                                            const uint16_t *src1,
+                                            int src1_stride, int h, int w,
+                                            const unsigned int bd) {
+  assert(DIFF_FACTOR > 0);
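+  // DIFFWTD_38 masks use a base weight of 38 and saturate at
+  // AOM_BLEND_A64_MAX_ALPHA.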
+  uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+  uint8x16_t mask_base = vdupq_n_u8(38);
+  uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38);
+
+  if (bd == 8) {
+    if (w >= 16) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+          uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+          uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+          uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+          uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+          uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+          uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+          uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+          uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+          uint8x16_t m;
+          if (inverse) {
+            m = vqsubq_u8(mask_diff, diff);
+          } else {
+            m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+          }
+
+          vst1q_u8(mask_ptr, m);
+
+          src0_ptr += 16;
+          src1_ptr += 16;
+          mask_ptr += 16;
+          width -= 16;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0 = vld1q_u16(src0_ptr);
+          uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+          uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+          uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+          uint8x8_t m;
+          if (inverse) {
+            m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+          } else {
+            m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                        vget_low_u8(max_alpha));
+          }
+
+          vst1_u8(mask_ptr, m);
+
+          src0_ptr += 8;
+          src1_ptr += 8;
+          mask_ptr += 8;
+          width -= 8;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 4) {
+      do {
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+        uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+        uint8x8_t m;
+        if (inverse) {
+          m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+        } else {
+          m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                      vget_low_u8(max_alpha));
+        }
+
+        store_u8_4x1(mask, m, 0);
+        store_u8_4x1(mask + w, m, 1);
+
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        mask += 2 * w;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else if (bd == 10) {
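+    // 10-bit input: shift the absolute differences by 2 extra bits so they are
+    // on an 8-bit scale before applying DIFF_FACTOR.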
+    if (w >= 16) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+          uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+          uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+          uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+          uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+          uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+          uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2);
+          uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2);
+          uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+          uint8x16_t m;
+          if (inverse) {
+            m = vqsubq_u8(mask_diff, diff);
+          } else {
+            m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+          }
+
+          vst1q_u8(mask_ptr, m);
+
+          src0_ptr += 16;
+          src1_ptr += 16;
+          mask_ptr += 16;
+          width -= 16;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0 = vld1q_u16(src0_ptr);
+          uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+          uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+          uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+          uint8x8_t m;
+          if (inverse) {
+            m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+          } else {
+            m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                        vget_low_u8(max_alpha));
+          }
+
+          vst1_u8(mask_ptr, m);
+
+          src0_ptr += 8;
+          src1_ptr += 8;
+          mask_ptr += 8;
+          width -= 8;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 4) {
+      do {
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+        uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+        uint8x8_t m;
+        if (inverse) {
+          m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+        } else {
+          m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                      vget_low_u8(max_alpha));
+        }
+
+        store_u8_4x1(mask, m, 0);
+        store_u8_4x1(mask + w, m, 1);
+
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        mask += 2 * w;
+        h -= 2;
+      } while (h != 0);
+    }
+  } else {
+    assert(bd == 12);
+    if (w >= 16) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+          uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+          uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+          uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+          uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+          uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+          uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2);
+          uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2);
+          uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+          uint8x16_t m;
+          if (inverse) {
+            m = vqsubq_u8(mask_diff, diff);
+          } else {
+            m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+          }
+
+          vst1q_u8(mask_ptr, m);
+
+          src0_ptr += 16;
+          src1_ptr += 16;
+          mask_ptr += 16;
+          width -= 16;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 8) {
+      do {
+        uint8_t *mask_ptr = mask;
+        const uint16_t *src0_ptr = src0;
+        const uint16_t *src1_ptr = src1;
+        int width = w;
+        do {
+          uint16x8_t s0 = vld1q_u16(src0_ptr);
+          uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+          uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+          uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+          uint8x8_t m;
+          if (inverse) {
+            m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+          } else {
+            m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                        vget_low_u8(max_alpha));
+          }
+
+          vst1_u8(mask_ptr, m);
+
+          src0_ptr += 8;
+          src1_ptr += 8;
+          mask_ptr += 8;
+          width -= 8;
+        } while (width != 0);
+        mask += w;
+        src0 += src0_stride;
+        src1 += src1_stride;
+      } while (--h != 0);
+    } else if (w == 4) {
+      do {
+        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+        uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+        uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+        uint8x8_t m;
+        if (inverse) {
+          m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+        } else {
+          m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+                      vget_low_u8(max_alpha));
+        }
+
+        store_u8_4x1(mask, m, 0);
+        store_u8_4x1(mask + w, m, 1);
+
+        src0 += 2 * src0_stride;
+        src1 += 2 * src1_stride;
+        mask += 2 * w;
+        h -= 2;
+      } while (h != 0);
+    }
+  }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
+  assert(h % 4 == 0);
+  assert(w % 4 == 0);
+  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0),
+                             src0_stride, CONVERT_TO_SHORTPTR(src1),
+                             src1_stride, h, w, bd);
+  } else {  // mask_type == DIFFWTD_38_INV
+    diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0),
+                             src0_stride, CONVERT_TO_SHORTPTR(src1),
+                             src1_stride, h, w, bd);
+  }
+}
diff --git a/av1/common/arm/highbd_reconintra_neon.c b/av1/common/arm/highbd_reconintra_neon.c
new file mode 100644
index 0000000..170491b
--- /dev/null
+++ b/av1/common/arm/highbd_reconintra_neon.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+  assert(sz >= 0 && sz <= 129);
+
+  DECLARE_ALIGNED(16, static const uint16_t,
+                  idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  const uint16x8_t index = vld1q_u16(idx);
+
+  uint16_t edge[160];  // Max value of sz + enough padding for vector accesses.
+  memcpy(edge + 1, p, sz * sizeof(*p));
+
+  // Populate extra space appropriately.
+  edge[0] = edge[1];
+  edge[sz + 1] = edge[sz];
+  edge[sz + 2] = edge[sz];
+
+  // Don't overwrite first pixel.
+  uint16_t *dst = p + 1;
+  sz--;
+
+  if (strength == 1) {  // Filter: {4, 8, 4}.
+    const uint16_t *src = edge + 1;
+
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddq_u16(s0, s2);
+      uint16x8_t t1 = vaddq_u16(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddq_u16(s0, s2);
+      uint16x8_t t1 = vaddq_u16(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  } else if (strength == 2) {  // Filter: {5, 6, 5}.
+    const uint16_t *src = edge + 1;
+
+    const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6),
+                                    vdupq_n_u16(5) } };
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+      accum = vmlaq_u16(accum, s1, filter.val[1]);
+      accum = vmlaq_u16(accum, s2, filter.val[2]);
+      uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+      accum = vmlaq_u16(accum, s1, filter.val[1]);
+      accum = vmlaq_u16(accum, s2, filter.val[2]);
+      uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  } else {  // Filter {2, 4, 4, 4, 2}.
+    const uint16_t *src = edge;
+
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+      uint16x8_t s4 = vld1q_u16(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddq_u16(s0, s4);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vaddq_u16(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+      uint16x8_t s4 = vld1q_u16(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddq_u16(s0, s4);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vaddq_u16(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  }
+}
+
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {
+  if (!sz) return;
+
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  uint16_t edge[MAX_UPSAMPLE_SZ + 3];
+  const uint16_t *src = edge;
+
+  // Copy p[-1..(sz-1)] and pad out both ends.
+  edge[0] = p[-1];
+  edge[1] = p[-1];
+  memcpy(edge + 2, p, sz * 2);
+  edge[sz + 2] = p[sz - 1];
+  p[-2] = p[-1];
+
+  uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1);
+
+  uint16_t *dst = p - 1;
+
+  if (bd == 12) {
+    do {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+
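+      // 12-bit input: 9 * (s1 + s2) can exceed 16 bits, so accumulate the
+      // {-1, 9, 9, -1} filter in 32 bits.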
+      uint16x8_t t0 = vaddq_u16(s1, s2);
+      uint16x8_t t1 = vaddq_u16(s0, s3);
+      uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9);
+      acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1)));
+      uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9);
+      acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1)));
+
+      uint16x8x2_t res;
+      res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4));
+      // Clamp pixel values at bitdepth maximum.
+      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+      res.val[1] = s2;
+
+      vst2q_u16(dst, res);
+
+      src += 8;
+      dst += 16;
+      sz -= 8;
+    } while (sz > 0);
+  } else {  // Bit depth is 8 or 10.
+    do {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+
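+      // For 8-bit and 10-bit input 9 * (s1 + s2) fits in 16 bits; vqsubq_u16
+      // clamps any negative filter output to zero.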
+      uint16x8_t t0 = vaddq_u16(s0, s3);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vmulq_n_u16(t1, 9);
+      t1 = vqsubq_u16(t1, t0);
+
+      uint16x8x2_t res;
+      res.val[0] = vrshrq_n_u16(t1, 4);
+      // Clamp pixel values at bitdepth maximum.
+      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+      res.val[1] = s2;
+
+      vst2q_u16(dst, res);
+
+      src += 8;
+      dst += 16;
+      sz -= 8;
+    } while (sz > 0);
+  }
+}
diff --git a/av1/common/arm/highbd_warp_plane_neon.c b/av1/common/arm/highbd_warp_plane_neon.c
new file mode 100644
index 0000000..0729df6
--- /dev/null
+++ b/av1/common/arm/highbd_warp_plane_neon.c
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+static INLINE int16x8_t load_filters_1(int ofs) {
+  const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);
+
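+  // The warped filter table is indexed from its midpoint because the rounded
+  // offset may be negative.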
+  const int16_t *base =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  return vld1q_s16(base + ofs0 * 8);
+}
+
+static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) {
+  const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS);
+  const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS);
+  const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
+  const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
+
+  const int16_t *base =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  out[0] = vld1q_s16(base + ofs0 * 8);
+  out[1] = vld1q_s16(base + ofs1 * 8);
+  out[2] = vld1q_s16(base + ofs2 * 8);
+  out[3] = vld1q_s16(base + ofs3 * 8);
+}
+
+static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) {
+  const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS);
+  const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS);
+  const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
+  const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
+  const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS);
+  const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS);
+  const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS);
+  const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS);
+
+  const int16_t *base =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  out[0] = vld1q_s16(base + ofs0 * 8);
+  out[1] = vld1q_s16(base + ofs1 * 8);
+  out[2] = vld1q_s16(base + ofs2 * 8);
+  out[3] = vld1q_s16(base + ofs3 * 8);
+  out[4] = vld1q_s16(base + ofs4 * 8);
+  out[5] = vld1q_s16(base + ofs5 * 8);
+  out[6] = vld1q_s16(base + ofs6 * 8);
+  out[7] = vld1q_s16(base + ofs7 * 8);
+}
+
+static INLINE int16x8_t warp_affine_horizontal_step_4x1_f4_neon(
+    int bd, int sx, int alpha, uint16x8x2_t in) {
+  int16x8_t f[4];
+  load_filters_4(f, sx, alpha);
+
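+  // vextq_s16 extracts the four overlapping 8-sample windows needed for
+  // consecutive output pixels.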
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));
+  m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+  int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+  m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+  int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+  m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+  int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+  m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+
+  int32x4_t m0123[] = { m0, m1, m2, m3 };
+
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+  int32x4_t res = horizontal_add_4d_s32x4(m0123);
+  res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+  res = vrshlq_s32(res, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));
+}
+
+static INLINE int16x8_t warp_affine_horizontal_step_8x1_f8_neon(
+    int bd, int sx, int alpha, uint16x8x2_t in) {
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+  int16x8_t f[8];
+  load_filters_8(f, sx, alpha);
+
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 4);
+  int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 5);
+  int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 6);
+  int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 7);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));
+  m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+  int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+  m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+  int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+  m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+  int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+  m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+  int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4));
+  m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4));
+  int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5));
+  m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5));
+  int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6));
+  m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6));
+  int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7));
+  m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7));
+
+  int32x4_t m0123[] = { m0, m1, m2, m3 };
+  int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+  int32x4_t res0 = horizontal_add_4d_s32x4(m0123);
+  int32x4_t res1 = horizontal_add_4d_s32x4(m4567);
+  res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+  res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+  res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+  res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+
+static INLINE void warp_affine_horizontal_neon(const uint16_t *ref, int width,
+                                               int height, int stride,
+                                               int p_width, int16_t alpha,
+                                               int16_t beta, int iy4, int sx4,
+                                               int ix4, int16x8_t tmp[],
+                                               int bd) {
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+
+  if (ix4 <= -7) {
+    for (int k = 0; k < 15; ++k) {
+      int iy = clamp(iy4 + k - 7, 0, height - 1);
+      int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) +
+                        ref[iy * stride] * (1 << (FILTER_BITS - round0));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  } else if (ix4 >= width + 6) {
+    for (int k = 0; k < 15; ++k) {
+      int iy = clamp(iy4 + k - 7, 0, height - 1);
+      int32_t dup_val =
+          (1 << (bd + FILTER_BITS - round0 - 1)) +
+          ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  }
+
+  for (int k = 0; k < 15; ++k) {
+    const int iy = clamp(iy4 + k - 7, 0, height - 1);
+    uint16x8x2_t in = vld1q_u16_x2(ref + iy * stride + ix4 - 7);
+
+    const int out_of_boundary_left = -(ix4 - 6);
+    const int out_of_boundary_right = (ix4 + 8) - width;
+
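+    // Replicate the nearest edge pixel into any lanes that fall outside the
+    // frame.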
+    const uint16_t k0[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                              8, 9, 10, 11, 12, 13, 14, 15 };
+    const uint16x8_t indx0 = vld1q_u16(&k0[0]);
+    const uint16x8_t indx1 = vld1q_u16(&k0[8]);
+
+    if (out_of_boundary_left >= 0) {
+      uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left);
+      uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]);
+      uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec);
+      uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec);
+      in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]);
+      in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]);
+    }
+    if (out_of_boundary_right >= 0) {
+      uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right);
+      uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]);
+      uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec);
+      uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec);
+      in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]);
+      in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]);
+    }
+
+    const int sx = sx4 + beta * (k - 3);
+    if (p_width == 4) {
+      tmp[k] = warp_affine_horizontal_step_4x1_f4_neon(bd, sx, alpha, in);
+    } else {
+      tmp[k] = warp_affine_horizontal_step_8x1_f8_neon(bd, sx, alpha, in);
+    }
+  }
+}
+
+static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) {
+  const int limit = (1 << bd) - 1;
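+  // vqmovun_s32 saturates negative values to zero; the min handles the upper
+  // bound.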
+  return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit)));
+}
+
+static INLINE int32x4_t
+warp_affine_vertical_filter_4x1_f1_neon(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+  return m0123;
+}
+
+static INLINE int32x4x2_t
+warp_affine_vertical_filter_8x1_f1_neon(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3);
+  return (int32x4x2_t){ { m0123, m4567 } };
+}
+
+static INLINE int32x4_t warp_affine_vertical_filter_4x1_f4_neon(
+    const int16x8_t *tmp, int sy, int gamma) {
+  int16x8_t s0, s1, s2, s3;
+  transpose_elems_s16_4x8(
+      vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]),
+      vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]),
+      vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3);
+
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+  int32x4_t m0123[] = { m0, m1, m2, m3 };
+  return horizontal_add_4d_s32x4(m0123);
+}
+
+static INLINE int32x4x2_t warp_affine_vertical_filter_8x1_f8_neon(
+    const int16x8_t *tmp, int sy, int gamma) {
+  int16x8_t s0 = tmp[0];
+  int16x8_t s1 = tmp[1];
+  int16x8_t s2 = tmp[2];
+  int16x8_t s3 = tmp[3];
+  int16x8_t s4 = tmp[4];
+  int16x8_t s5 = tmp[5];
+  int16x8_t s6 = tmp[6];
+  int16x8_t s7 = tmp[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+  int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+  m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+  int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+  m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+  int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+  m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+  int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+  m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+  int32x4_t m0123[] = { m0, m1, m2, m3 };
+  int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+  int32x4x2_t ret;
+  ret.val[0] = horizontal_add_4d_s32x4(m0123);
+  ret.val[1] = horizontal_add_4d_s32x4(m4567);
+  return ret;
+}
+
+static INLINE void warp_affine_vertical_step_4x1_f4_neon(
+    uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+    bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+    int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+  int32x4_t sum0 =
+      gamma == 0 ? warp_affine_vertical_filter_4x1_f1_neon(tmp, sy)
+                 : warp_affine_vertical_filter_4x1_f4_neon(tmp, sy, gamma);
+
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+  sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+
+  uint16_t *dst16 = &pred[i * p_stride + j];
+
+  if (!is_compound) {
+    const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+    sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+
+    const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+    sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+    uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+    vst1_u16(dst16, res0);
+    return;
+  }
+
+  sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+
+  uint16_t *p = &dst[i * dst_stride + j];
+
+  if (!do_average) {
+    vst1_u16(p, vqmovun_s32(sum0));
+    return;
+  }
+
+  uint16x4_t p0 = vld1_u16(p);
+  int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0));
+  if (use_dist_wtd_comp_avg) {
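+    // Distance-weighted average: (fwd * p + bwd * sum) >> DIST_PRECISION_BITS.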
+    p_vec0 = vmulq_n_s32(p_vec0, fwd);
+    p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+    p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+  } else {
+    p_vec0 = vhaddq_s32(p_vec0, sum0);
+  }
+
+  const int offset_bits = bd + 2 * FILTER_BITS - round0;
+  const int round1 = COMPOUND_ROUND1_BITS;
+  const int res_sub_const =
+      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+  const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+  p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+  p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+  uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+  vst1_u16(dst16, res0);
+}
+
+static INLINE void warp_affine_vertical_step_8x1_f8_neon(
+    uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+    bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+    int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+  int32x4x2_t sums =
+      gamma == 0 ? warp_affine_vertical_filter_8x1_f1_neon(tmp, sy)
+                 : warp_affine_vertical_filter_8x1_f8_neon(tmp, sy, gamma);
+  int32x4_t sum0 = sums.val[0];
+  int32x4_t sum1 = sums.val[1];
+
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+  sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+  sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert));
+
+  uint16_t *dst16 = &pred[i * p_stride + j];
+
+  if (!is_compound) {
+    const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+    sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+    sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert));
+
+    const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+    sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+    sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const));
+    uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+    uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd);
+    vst1_u16(dst16, res0);
+    vst1_u16(dst16 + 4, res1);
+    return;
+  }
+
+  sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+  sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS);
+
+  uint16_t *p = &dst[i * dst_stride + j];
+
+  if (!do_average) {
+    vst1_u16(p, vqmovun_s32(sum0));
+    vst1_u16(p + 4, vqmovun_s32(sum1));
+    return;
+  }
+
+  uint16x8_t p0 = vld1q_u16(p);
+  int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0)));
+  int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0)));
+  if (use_dist_wtd_comp_avg) {
+    p_vec0 = vmulq_n_s32(p_vec0, fwd);
+    p_vec1 = vmulq_n_s32(p_vec1, fwd);
+    p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+    p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd);
+    p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+    p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS);
+  } else {
+    p_vec0 = vhaddq_s32(p_vec0, sum0);
+    p_vec1 = vhaddq_s32(p_vec1, sum1);
+  }
+
+  const int offset_bits = bd + 2 * FILTER_BITS - round0;
+  const int round1 = COMPOUND_ROUND1_BITS;
+  const int res_sub_const =
+      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+  const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+  p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+  p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const));
+
+  p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+  p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits));
+  uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+  uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd);
+  vst1_u16(dst16, res0);
+  vst1_u16(dst16 + 4, res1);
+}
+
+static INLINE void warp_affine_vertical_neon(
+    uint16_t *pred, int p_width, int p_height, int p_stride, int bd,
+    uint16_t *dst, int dst_stride, bool is_compound, bool do_average,
+    bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta,
+    const int16x8_t *tmp, int i, int sy4, int j) {
+  int limit_height = p_height > 4 ? 8 : 4;
+
+  if (p_width > 4) {
+    // p_width == 8
+    for (int k = 0; k < limit_height; ++k) {
+      int sy = sy4 + delta * k;
+      warp_affine_vertical_step_8x1_f8_neon(
+          pred, p_stride, bd, dst, dst_stride, is_compound, do_average,
+          use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j);
+    }
+  } else {
+    // p_width == 4
+    for (int k = 0; k < limit_height; ++k) {
+      int sy = sy4 + delta * k;
+      warp_affine_vertical_step_4x1_f4_neon(
+          pred, p_stride, bd, dst, dst_stride, is_compound, do_average,
+          use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j);
+    }
+  }
+}
+
+void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref,
+                                 int width, int height, int stride,
+                                 uint16_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y, int bd,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta) {
+  uint16_t *const dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const bool is_compound = conv_params->is_compound;
+  const bool do_average = conv_params->do_average;
+  const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int fwd = conv_params->fwd_offset;
+  const int bwd = conv_params->bck_offset;
+
+  assert(IMPLIES(is_compound, dst != NULL));
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      // Calculate the center of this 8x8 block,
+      // project to luma coordinates (if in a subsampled chroma plane),
+      // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary)
+      const int32_t src_x = (j + 4 + p_col) << subsampling_x;
+      const int32_t src_y = (i + 4 + p_row) << subsampling_y;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      sx4 += alpha * (-4) + beta * (-4);
+      sy4 += gamma * (-4) + delta * (-4);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Each horizontal filter result is formed by the sum of up to eight
+      // multiplications by filter values and then a shift. Although both the
+      // inputs and filters are loaded as int16, the input data is at most bd
+      // bits and the filters are at most 8 bits each. Additionally since we
+      // know all possible filter values we know that the sum of absolute
+      // filter values will fit in at most 9 bits. With this in mind we can
+      // conclude that the sum of each filter application will fit in bd + 9
+      // bits. The shift following the summation is ROUND0_BITS (which is 3),
+      // +2 for 12-bit, which gives us a final storage of:
+      // bd ==  8: ( 8 + 9) - 3 => 14 bits
+      // bd == 10: (10 + 9) - 3 => 16 bits
+      // bd == 12: (12 + 9) - 5 => 16 bits
+      // So it is safe to use int16x8_t as the intermediate storage type here.
+      int16x8_t tmp[15];
+
+      warp_affine_horizontal_neon(ref, width, height, stride, p_width, alpha,
+                                  beta, iy4, sx4, ix4, tmp, bd);
+      warp_affine_vertical_neon(pred, p_width, p_height, p_stride, bd, dst,
+                                dst_stride, is_compound, do_average,
+                                use_dist_wtd_comp_avg, fwd, bwd, gamma, delta,
+                                tmp, i, sy4, j);
+    }
+  }
+}
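A minimal scalar sketch of the compound-average path in the warp_affine_vertical_step_*_neon() helpers above (illustrative only, not part of the patch; the function name and the dist_precision_bits parameter are stand-ins). dst_val is the value read back from conv_params->dst and filt_val is the new filter output:

  #include <stdbool.h>
  #include <stdint.h>

  // Scalar equivalent of the per-pixel blend performed before the final
  // offset subtraction, rounding shift and clip in the vertical warp step.
  static int32_t compound_average_sketch(int32_t dst_val, int32_t filt_val,
                                         bool use_dist_wtd_comp_avg, int fwd,
                                         int bwd, int dist_precision_bits) {
    if (use_dist_wtd_comp_avg) {
      // Distance-weighted blend: matches vmulq_n_s32 + vmlaq_n_s32 followed
      // by vshrq_n_s32(..., DIST_PRECISION_BITS) in the NEON code.
      return (dst_val * fwd + filt_val * bwd) >> dist_precision_bits;
    }
    // Plain average: matches vhaddq_s32, i.e. (a + b) >> 1.
    return (dst_val + filt_val) >> 1;
  }

After this blend the vector code subtracts res_sub_const, applies a rounding shift by round_bits, and clips the result to the target bit depth with clip_pixel_highbd_vec().
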
diff --git a/av1/common/arm/highbd_wiener_convolve_neon.c b/av1/common/arm/highbd_wiener_convolve_neon.c
new file mode 100644
index 0000000..a6bd6d3
--- /dev/null
+++ b/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#define HBD_WIENER_5TAP_HORIZ(name, shift)                              \
+  static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h(              \
+      const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,       \
+      const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \
+      const int32x4_t round_vec, const uint16x8_t im_max_val) {         \
+    /* Wiener filter is symmetric so add mirrored source elements. */   \
+    int16x8_t s04 = vaddq_s16(s0, s4);                                  \
+    int16x8_t s13 = vaddq_s16(s1, s3);                                  \
+                                                                        \
+    /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */      \
+    int32x4_t sum_lo =                                                  \
+        vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1);      \
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2);    \
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3);     \
+                                                                        \
+    int32x4_t sum_hi =                                                  \
+        vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1);     \
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2);   \
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3);    \
+                                                                        \
+    uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift);                  \
+    uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift);                  \
+                                                                        \
+    return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val);         \
+  }                                                                     \
+                                                                        \
+  static INLINE void name##_convolve_add_src_5tap_horiz(                \
+      const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+      ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,     \
+      const int32x4_t round_vec, const uint16x8_t im_max_val) {         \
+    do {                                                                \
+      const int16_t *s = (int16_t *)src_ptr;                            \
+      uint16_t *d = dst_ptr;                                            \
+      int width = w;                                                    \
+                                                                        \
+      do {                                                              \
+        int16x8_t s0, s1, s2, s3, s4;                                   \
+        load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4);                    \
+                                                                        \
+        uint16x8_t d0 = name##_wiener_convolve5_8_2d_h(                 \
+            s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val);       \
+                                                                        \
+        vst1q_u16(d, d0);                                               \
+                                                                        \
+        s += 8;                                                         \
+        d += 8;                                                         \
+        width -= 8;                                                     \
+      } while (width != 0);                                             \
+      src_ptr += src_stride;                                            \
+      dst_ptr += dst_stride;                                            \
+    } while (--h != 0);                                                 \
+  }
+
+HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_5TAP_HORIZ
+
+#define HBD_WIENER_7TAP_HORIZ(name, shift)                                     \
+  static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h(                     \
+      const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,              \
+      const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,              \
+      const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \
+      const uint16x8_t im_max_val) {                                           \
+    /* Wiener filter is symmetric so add mirrored source elements. */          \
+    int16x8_t s06 = vaddq_s16(s0, s6);                                         \
+    int16x8_t s15 = vaddq_s16(s1, s5);                                         \
+    int16x8_t s24 = vaddq_s16(s2, s4);                                         \
+                                                                               \
+    int32x4_t sum_lo =                                                         \
+        vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);             \
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);           \
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);           \
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);            \
+                                                                               \
+    int32x4_t sum_hi =                                                         \
+        vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);            \
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);          \
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);          \
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);           \
+                                                                               \
+    uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift);                         \
+    uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift);                         \
+                                                                               \
+    return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val);                \
+  }                                                                            \
+                                                                               \
+  static INLINE void name##_convolve_add_src_7tap_horiz(                       \
+      const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,        \
+      ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,            \
+      const int32x4_t round_vec, const uint16x8_t im_max_val) {                \
+    do {                                                                       \
+      const int16_t *s = (int16_t *)src_ptr;                                   \
+      uint16_t *d = dst_ptr;                                                   \
+      int width = w;                                                           \
+                                                                               \
+      do {                                                                     \
+        int16x8_t s0, s1, s2, s3, s4, s5, s6;                                  \
+        load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);                 \
+                                                                               \
+        uint16x8_t d0 = name##_wiener_convolve7_8_2d_h(                        \
+            s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val);      \
+                                                                               \
+        vst1q_u16(d, d0);                                                      \
+                                                                               \
+        s += 8;                                                                \
+        d += 8;                                                                \
+        width -= 8;                                                            \
+      } while (width != 0);                                                    \
+      src_ptr += src_stride;                                                   \
+      dst_ptr += dst_stride;                                                   \
+    } while (--h != 0);                                                        \
+  }
+
+HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_7TAP_HORIZ
+
+#define HBD_WIENER_5TAP_VERT(name, shift)                                     \
+  static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v(                    \
+      const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,             \
+      const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter,       \
+      const int32x4_t round_vec, const uint16x8_t res_max_val) {              \
+    const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter));          \
+    const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter));         \
+    /* Wiener filter is symmetric so add mirrored source elements. */         \
+    int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4));         \
+    int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3));         \
+                                                                              \
+    /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */            \
+    int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1);     \
+    sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0);                  \
+    sum_lo =                                                                  \
+        vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1);  \
+                                                                              \
+    int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4));       \
+    int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3));       \
+                                                                              \
+    int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1);     \
+    sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0);                  \
+    sum_hi =                                                                  \
+        vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \
+                                                                              \
+    uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift);                        \
+    uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift);                        \
+                                                                              \
+    return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val);              \
+  }                                                                           \
+                                                                              \
+  static INLINE void name##_convolve_add_src_5tap_vert(                       \
+      const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,       \
+      ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,           \
+      const int32x4_t round_vec, const uint16x8_t res_max_val) {              \
+    do {                                                                      \
+      const int16_t *s = (int16_t *)src_ptr;                                  \
+      uint16_t *d = dst_ptr;                                                  \
+      int height = h;                                                         \
+                                                                              \
+      while (height > 3) {                                                    \
+        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;                             \
+        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);  \
+                                                                              \
+        uint16x8_t d0 = name##_wiener_convolve5_8_2d_v(                       \
+            s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val);            \
+        uint16x8_t d1 = name##_wiener_convolve5_8_2d_v(                       \
+            s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val);            \
+        uint16x8_t d2 = name##_wiener_convolve5_8_2d_v(                       \
+            s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val);            \
+        uint16x8_t d3 = name##_wiener_convolve5_8_2d_v(                       \
+            s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val);            \
+                                                                              \
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);                         \
+                                                                              \
+        s += 4 * src_stride;                                                  \
+        d += 4 * dst_stride;                                                  \
+        height -= 4;                                                          \
+      }                                                                       \
+                                                                              \
+      while (height-- != 0) {                                                 \
+        int16x8_t s0, s1, s2, s3, s4;                                         \
+        load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);                 \
+                                                                              \
+        uint16x8_t d0 = name##_wiener_convolve5_8_2d_v(                       \
+            s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val);            \
+                                                                              \
+        vst1q_u16(d, d0);                                                     \
+                                                                              \
+        s += src_stride;                                                      \
+        d += dst_stride;                                                      \
+      }                                                                       \
+                                                                              \
+      src_ptr += 8;                                                           \
+      dst_ptr += 8;                                                           \
+      w -= 8;                                                                 \
+    } while (w != 0);                                                         \
+  }
+
+HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_5TAP_VERT
+
+#define HBD_WIENER_7TAP_VERT(name, shift)                                      \
+  static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v(                     \
+      const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,              \
+      const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,              \
+      const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \
+      const uint16x8_t res_max_val) {                                          \
+    const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter));           \
+    const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter));          \
+    /* Wiener filter is symmetric so add mirrored source elements. */          \
+    int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6));          \
+    int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5));          \
+    int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4));          \
+                                                                               \
+    int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0);      \
+    sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1);                   \
+    sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0);                   \
+    sum_lo =                                                                   \
+        vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1);   \
+                                                                               \
+    int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6));        \
+    int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5));        \
+    int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4));        \
+                                                                               \
+    int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0);      \
+    sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1);                   \
+    sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0);                   \
+    sum_hi =                                                                   \
+        vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1);  \
+                                                                               \
+    uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift);                         \
+    uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift);                         \
+                                                                               \
+    return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val);               \
+  }                                                                            \
+                                                                               \
+  static INLINE void name##_convolve_add_src_7tap_vert(                        \
+      const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,        \
+      ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,            \
+      const int32x4_t round_vec, const uint16x8_t res_max_val) {               \
+    do {                                                                       \
+      const int16_t *s = (int16_t *)src_ptr;                                   \
+      uint16_t *d = dst_ptr;                                                   \
+      int height = h;                                                          \
+                                                                               \
+      while (height > 3) {                                                     \
+        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;                      \
+        load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,   \
+                      &s8, &s9);                                               \
+                                                                               \
+        uint16x8_t d0 = name##_wiener_convolve7_8_2d_v(                        \
+            s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val);     \
+        uint16x8_t d1 = name##_wiener_convolve7_8_2d_v(                        \
+            s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val);     \
+        uint16x8_t d2 = name##_wiener_convolve7_8_2d_v(                        \
+            s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val);     \
+        uint16x8_t d3 = name##_wiener_convolve7_8_2d_v(                        \
+            s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val);     \
+                                                                               \
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);                          \
+                                                                               \
+        s += 4 * src_stride;                                                   \
+        d += 4 * dst_stride;                                                   \
+        height -= 4;                                                           \
+      }                                                                        \
+                                                                               \
+      while (height-- != 0) {                                                  \
+        int16x8_t s0, s1, s2, s3, s4, s5, s6;                                  \
+        load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);        \
+                                                                               \
+        uint16x8_t d0 = name##_wiener_convolve7_8_2d_v(                        \
+            s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val);     \
+                                                                               \
+        vst1q_u16(d, d0);                                                      \
+                                                                               \
+        s += src_stride;                                                       \
+        d += dst_stride;                                                       \
+      }                                                                        \
+                                                                               \
+      src_ptr += 8;                                                            \
+      dst_ptr += 8;                                                            \
+      w -= 8;                                                                  \
+    } while (w != 0);                                                          \
+  }
+
+HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_7TAP_VERT
+
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+  assert(filter[7] == 0);
+  if (filter[0] == 0 && filter[6] == 0) {
+    return WIENER_WIN_REDUCED;
+  }
+  return WIENER_WIN;
+}
+
+void av1_highbd_wiener_convolve_add_src_neon(
+    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+    ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
+    const int16_t *y_filter, int y_step_q4, int w, int h,
+    const WienerConvolveParams *conv_params, int bd) {
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  assert(w % 8 == 0);
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(x_filter[7] == 0 && y_filter[7] == 0);
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+  const int x_filter_taps = get_wiener_filter_taps(x_filter);
+  const int y_filter_taps = get_wiener_filter_taps(y_filter);
+  int16x4_t x_filter_s16 = vld1_s16(x_filter);
+  int16x4_t y_filter_s16 = vld1_s16(y_filter);
+  // Add 128 to tap 3. (Needed for rounding.)
+  x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+  y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
+
+  const int im_stride = MAX_SB_SIZE;
+  const int im_h = h + y_filter_taps - 1;
+  const int horiz_offset = x_filter_taps / 2;
+  const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
+
+  const int extraprec_clamp_limit =
+      WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
+  const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
+  const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+  const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
+  const int32x4_t vert_round_vec =
+      vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1)));
+
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  if (bd == 12) {
+    if (x_filter_taps == WIENER_WIN_REDUCED) {
+      highbd_12_convolve_add_src_5tap_horiz(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, x_filter_s16, horiz_round_vec, im_max_val);
+    } else {
+      highbd_12_convolve_add_src_7tap_horiz(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, x_filter_s16, horiz_round_vec, im_max_val);
+    }
+
+    if (y_filter_taps == WIENER_WIN_REDUCED) {
+      highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride,
+                                           w, h, y_filter_s16, vert_round_vec,
+                                           res_max_val);
+    } else {
+      highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride,
+                                           w, h, y_filter_s16, vert_round_vec,
+                                           res_max_val);
+    }
+
+  } else {
+    if (x_filter_taps == WIENER_WIN_REDUCED) {
+      highbd_convolve_add_src_5tap_horiz(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, x_filter_s16, horiz_round_vec, im_max_val);
+    } else {
+      highbd_convolve_add_src_7tap_horiz(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, x_filter_s16, horiz_round_vec, im_max_val);
+    }
+
+    if (y_filter_taps == WIENER_WIN_REDUCED) {
+      highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w,
+                                        h, y_filter_s16, vert_round_vec,
+                                        res_max_val);
+    } else {
+      highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w,
+                                        h, y_filter_s16, vert_round_vec,
+                                        res_max_val);
+    }
+  }
+}
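A minimal scalar sketch of the symmetric tap evaluation used by the Wiener macros above (illustrative only, not part of the patch; the function name is a stand-in). As the comments note, the filter is symmetric and 5-tap filters are zero-padded to 7 taps:

  #include <stdint.h>

  // Scalar equivalent of one 7-tap symmetric Wiener filter output. s points
  // at the first of seven consecutive samples; filter[] holds the stored
  // taps, with filter[7] == 0 and filter[i] == filter[6 - i].
  static int32_t wiener_7tap_symmetric_sketch(const int16_t *s,
                                              const int16_t *filter,
                                              int32_t round_const) {
    int32_t sum = round_const;
    // Sum mirrored samples first so each distinct tap is multiplied once,
    // mirroring the s06/s15/s24 additions in the NEON macros.
    sum += (int32_t)(s[0] + s[6]) * filter[0];
    sum += (int32_t)(s[1] + s[5]) * filter[1];
    sum += (int32_t)(s[2] + s[4]) * filter[2];
    sum += (int32_t)s[3] * filter[3];
    return sum;  // The NEON code then narrows with vqrshrun_n_s32.
  }

The reduced 5-tap case is the same computation with filter[0] == 0, which is what get_wiener_filter_taps() checks when selecting between the WIENER_WIN and WIENER_WIN_REDUCED paths.
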
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
deleted file mode 100644
index 564f7c2..0000000
--- a/av1/common/arm/jnt_convolve_neon.c
+++ /dev/null
@@ -1,5336 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/arm/mem_neon.h"
-#include "aom_dsp/arm/transpose_neon.h"
-#include "aom_ports/mem.h"
-#include "av1/common/common.h"
-#include "av1/common/arm/convolve_neon.h"
-
-#if !AOM_ARCH_AARCH64
-static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
-                                            const uint16_t fwd_offset,
-                                            const uint16_t bck_offset,
-                                            const int16x4_t round_offset,
-                                            uint8x8_t *d0_u8) {
-  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
-  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
-
-  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
-
-  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
-
-  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
-
-  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
-                                         const int16x4_t round_offset,
-                                         uint8x8_t *d0_u8) {
-  uint16x4_t avg0 = vhadd_u16(dd0, d0);
-
-  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
-
-  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
-
-  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
-                                            const uint16_t fwd_offset,
-                                            const uint16_t bck_offset,
-                                            const int16x8_t round_offset,
-                                            uint8x8_t *d0_u8) {
-  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
-  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
-  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
-  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
-
-  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
-                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
-
-  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-
-  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
-                                         const int16x8_t round_offset,
-                                         uint8x8_t *d0_u8) {
-  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
-
-  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-
-  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-}
-
-#endif  // !AOM_ARCH_AARCH64
-
-static INLINE void compute_dist_wtd_avg_4x4(
-    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
-    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
-    const uint16_t fwd_offset, const uint16_t bck_offset,
-    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
-  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
-  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
-  uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
-  blend1 = vmlal_n_u16(blend1, d1, bck_offset);
-  uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
-  blend2 = vmlal_n_u16(blend2, d2, bck_offset);
-  uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
-  blend3 = vmlal_n_u16(blend3, d3, bck_offset);
-
-  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
-  uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
-  uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
-  uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
-
-  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
-  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
-
-  dst_01 = vsubq_s16(dst_01, round_offset);
-  dst_23 = vsubq_s16(dst_23, round_offset);
-
-  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
-  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
-                                         uint16x4_t dd2, uint16x4_t dd3,
-                                         uint16x4_t d0, uint16x4_t d1,
-                                         uint16x4_t d2, uint16x4_t d3,
-                                         const int16x8_t round_offset,
-                                         uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
-  uint16x4_t avg0 = vhadd_u16(dd0, d0);
-  uint16x4_t avg1 = vhadd_u16(dd1, d1);
-  uint16x4_t avg2 = vhadd_u16(dd2, d2);
-  uint16x4_t avg3 = vhadd_u16(dd3, d3);
-
-  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
-  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
-
-  dst_01 = vsubq_s16(dst_01, round_offset);
-  dst_23 = vsubq_s16(dst_23, round_offset);
-
-  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
-  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_dist_wtd_avg_8x4(
-    uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
-    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
-    const uint16_t fwd_offset, const uint16_t bck_offset,
-    const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
-    uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
-  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
-  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
-  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
-  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
-
-  uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
-  blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
-  uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
-  blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);
-
-  uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
-  blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
-  uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
-  blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);
-
-  uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
-  blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
-  uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
-  blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);
-
-  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
-                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
-  uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
-                                 vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
-  uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
-                                 vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
-  uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
-                                 vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));
-
-  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
-  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
-  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
-
-  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
-  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
-  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
-}
-
-static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
-                                         uint16x8_t dd2, uint16x8_t dd3,
-                                         uint16x8_t d0, uint16x8_t d1,
-                                         uint16x8_t d2, uint16x8_t d3,
-                                         const int16x8_t round_offset,
-                                         uint8x8_t *d0_u8, uint8x8_t *d1_u8,
-                                         uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
-  uint16x8_t avg0, avg1, avg2, avg3;
-
-  avg0 = vhaddq_u16(dd0, d0);
-  avg1 = vhaddq_u16(dd1, d1);
-  avg2 = vhaddq_u16(dd2, d2);
-  avg3 = vhaddq_u16(dd3, d3);
-
-  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
-  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
-  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
-  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
-
-  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
-  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
-  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
-  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
-}
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
-                                         const int8x8_t x_filter,
-                                         const uint8x16x2_t permute_tbl,
-                                         const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  // First 4 output values.
-  sum = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshrn_n_s32(sum, ROUND0_BITS - 1);
-}
-
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
-                                         const int8x8_t x_filter,
-                                         const uint8x16x3_t permute_tbl,
-                                         const int32x4_t horiz_const) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum[2];
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  // First 4 output values.
-  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
-  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
-  // Second 4 output values.
-  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
-  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
-  // Narrow and re-pack.
-  // We halved the convolution filter values so -1 from the right shift.
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
-    const int16x8_t x_filter_s16, const int im_h, int w) {
-  const int bd = 8;
-  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // (The extra -1 is needed because we halved the filter values.)
-  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
-                                            (1 << ((ROUND0_BITS - 1) - 1)));
-  // Horizontal filter.
-  const int8x8_t x_filter = vmovn_s16(x_filter_s16);
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-  int height = im_h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
-      d1 = convolve8_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
-      d2 = convolve8_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
-      d3 = convolve8_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
-      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
-        d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
-        d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
-        d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE int16x4_t convolve8_4_2d_h(uint8x16_t samples,
-                                         const int8x8_t x_filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  // Accumulate dot product into 'correction' to account for range clamp.
-  sum = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshrn_n_s32(sum, ROUND0_BITS - 1);
-}
-
-static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
-                                         const int8x8_t x_filter,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum[2];
-
-  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  // Accumulate dot product into 'correction' to account for range clamp.
-  // First 4 output values.
-  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
-  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
-  // Second 4 output values.
-  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
-  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
-  // Narrow and re-pack.
-  // We halved the convolution filter values so -1 from the right shift.
-  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
-    const int16x8_t x_filter_s16, const int im_h, int w) {
-  const int bd = 8;
-  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
-  // Dot product constants and other shims.
-  const int32_t correction_s32 = vaddlvq_s16(vshlq_n_s16(x_filter_s16, 7));
-  // Fold horiz_const into the dot-product filter correction constant. The
-  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
-  // rounding shifts - which are generally faster than rounding shifts on
-  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
-  const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
-                                           (1 << ((ROUND0_BITS - 1) - 1)));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  // Horizontal filter.
-  const int8x8_t x_filter = vmovn_s16(x_filter_s16);
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-  int height = im_h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
-
-      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
-                              permute_tbl);
-        d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
-                              permute_tbl);
-        d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
-                              permute_tbl);
-        d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
-                              permute_tbl);
-
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE int16x4_t convolve8_4_2d_h(const int16x4_t s0, const int16x4_t s1,
-                                         const int16x4_t s2, const int16x4_t s3,
-                                         const int16x4_t s4, const int16x4_t s5,
-                                         const int16x4_t s6, const int16x4_t s7,
-                                         const int16x8_t x_filter,
-                                         const int16x4_t horiz_const) {
-  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
-  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
-  int16x4_t sum = horiz_const;
-  sum = vmla_lane_s16(sum, s0, x_filter_0_3, 0);
-  sum = vmla_lane_s16(sum, s1, x_filter_0_3, 1);
-  sum = vmla_lane_s16(sum, s2, x_filter_0_3, 2);
-  sum = vmla_lane_s16(sum, s3, x_filter_0_3, 3);
-  sum = vmla_lane_s16(sum, s4, x_filter_4_7, 0);
-  sum = vmla_lane_s16(sum, s5, x_filter_4_7, 1);
-  sum = vmla_lane_s16(sum, s6, x_filter_4_7, 2);
-  sum = vmla_lane_s16(sum, s7, x_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshr_n_s16(sum, ROUND0_BITS - 1);
-}
-
-static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
-                                         const int16x8_t s2, const int16x8_t s3,
-                                         const int16x8_t s4, const int16x8_t s5,
-                                         const int16x8_t s6, const int16x8_t s7,
-                                         const int16x8_t x_filter,
-                                         const int16x8_t horiz_const) {
-  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
-  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
-  int16x8_t sum = horiz_const;
-  sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
-  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
-  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
-  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
-  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
-  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
-  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
-  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vshrq_n_s16(sum, ROUND0_BITS - 1);
-}
-
-static INLINE void dist_wtd_convolve_2d_horiz_8tap_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
-    const int16x8_t x_filter, const int im_h, int w) {
-  const int bd = 8;
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-  int height = im_h;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-    uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10, d1, d2, d3;
-    uint8x8_t t1, t2, t3;
-#endif  // AOM_ARCH_AARCH64
-
-    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-    // shifts - which are generally faster than rounding shifts on modern CPUs.
-    // (The extra -1 is needed because we halved the filter values.)
-    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
-                                             (1 << ((ROUND0_BITS - 1) - 1)));
-    do {
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-      load_u8_8x4(src_ptr + 7, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                            horiz_const);
-      d1 = convolve8_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                            horiz_const);
-      d2 = convolve8_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                            horiz_const);
-      d3 = convolve8_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                            horiz_const);
-
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-#else   // !AOM_ARCH_AARCH64
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));   // a0 a1 a2 a3
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));  // a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      t0 = vld1_u8(src_ptr + 8);  // a8 a9 a10 a11
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
-      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
-      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
-
-      d0 = convolve8_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                            horiz_const);
-      vst1_s16(dst_ptr, d0);
-
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height > 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, d0;
-    uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-    int16x8_t s9, s10, s11, s12, s13, s14;
-    int16x8_t d1, d2, d3, d4, d5, d6, d7;
-    uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-    // shifts - which are generally faster than rounding shifts on modern CPUs.
-    // (The extra -1 is needed because we halved the filter values.)
-    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
-                                              (1 << ((ROUND0_BITS - 1) - 1)));
-    do {
-      const uint8_t *s;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-#if AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s = src_ptr + 7;
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                              horiz_const);
-        d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                              horiz_const);
-        d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                              horiz_const);
-        d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                              horiz_const);
-        d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                              horiz_const);
-        d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                              horiz_const);
-        d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                              horiz_const);
-        d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
-                              horiz_const);
-
-        transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * dst_stride;
-      height -= 8;
-#else   // !AOM_ARCH_AARCH64
-      t0 = vld1_u8(src_ptr);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      s = src_ptr + 8;
-      __builtin_prefetch(dst_ptr);
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
-        s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                              horiz_const);
-        vst1q_s16(d, d0);
-
-        s0 = s8;
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height > 0);
-  }
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
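As context for the horiz_const "shim" used in the horizontal kernels above: pre-adding half of the final divisor into the accumulator turns a rounding right shift into a plain (truncating) shift, which is what lets the kernels use the cheaper non-rounding narrowing shifts. A minimal scalar sketch of that identity; the names and the value of n are illustrative only, not taken from the library:

    #include <assert.h>
    #include <stdint.h>

    // Rounding right shift: (x + 2^(n-1)) >> n.
    static int32_t round_shift(int32_t x, int n) {
      return (x + (1 << (n - 1))) >> n;
    }

    // Folding the rounding bias into the accumulator's initial value (the
    // horiz_const shim) lets the kernel finish with a plain shift instead.
    static void shim_demo(void) {
      const int n = 2;                    // stand-in for ROUND0_BITS - 1
      const int32_t shim = 1 << (n - 1);  // the "+ (1 << ((ROUND0_BITS - 1) - 1))"
      for (int32_t acc = -100; acc <= 100; ++acc) {
        assert(round_shift(acc, n) == ((acc + shim) >> n));
      }
    }

The same trick appears again in the ROUND0_BITS - 1 shifts of the dot-product kernels further down.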
-
-static INLINE uint16x4_t
-convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                 const int16x8_t y_filter, const int32x4_t offset_const) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int32x4_t sum = offset_const;
-  // Filter values at indices 0 and 7 are 0.
-  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
-  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
-  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
-  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
-  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
-  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
-
-  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t
-convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                 const int16x8_t y_filter, const int32x4_t offset_const) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int32x4_t sum0 = offset_const;
-  // Filter values at indices 0 and 7 are 0.
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
-
-  int32x4_t sum1 = offset_const;
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
-
-  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
-                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
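The comment "Filter values at indices 0 and 7 are 0" refers to AV1's 6-tap interpolation filters being stored in 8-entry arrays with zero outer taps, which is why the kernels above only accumulate taps 1..6 and why the 2D driver later in this file starts the 6-tap vertical pass one intermediate row later. A scalar sketch of what one convolve6_4_2d_v output amounts to, with the saturating rounding narrow (vqrshrun_n_s32) written out as a rounded shift plus a clamp to the uint16 range; the helper name below is illustrative:

    #include <stdint.h>

    // Scalar model of one vertical 6-tap output. 'src' points at the row that
    // lines up with filter tap 1 (tap 0 is zero and skipped), 'filter' is the
    // 8-entry filter array, 'offset' is 1 << offset_bits and 'round1' is
    // COMPOUND_ROUND1_BITS.
    static uint16_t convolve6_v_scalar(const int16_t *src, int stride,
                                       const int16_t *filter, int32_t offset,
                                       int round1) {
      int32_t sum = offset;
      for (int k = 1; k <= 6; ++k) {  // filter[0] == filter[7] == 0
        sum += (int32_t)filter[k] * src[(k - 1) * stride];
      }
      sum = (sum + (1 << (round1 - 1))) >> round1;  // rounding shift
      if (sum < 0) sum = 0;                         // saturate to the uint16
      if (sum > UINT16_MAX) sum = UINT16_MAX;       // range, as vqrshrun does
      return (uint16_t)sum;
    }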
-
-static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
-    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
-    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t dd0, d0;
-    uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d23_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
-    src_ptr += 5 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-      d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-      d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-      d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-      dst8_ptr += 4 * dst8_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s5 = vld1_s16(src_ptr);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-      dd0 = vld1_u16(dst_ptr);
-
-      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
-                               vget_low_s16(round_offset_vec), &d01_u8);
-
-      store_u8_4x1(dst8_ptr, d01_u8, 0);
-      dst8_ptr += dst8_stride;
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t dd0, d0;
-    uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8;
-    uint16x8_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-        d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-        d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-        d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vld1q_s16(s);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-        dd0 = vld1q_u16(d);
-
-        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
-                                 round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
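For reference, the distance-weighted combine done by compute_dist_wtd_avg_* mixes the first prediction already sitting in the CONV_BUF with the freshly convolved value using the fwd_offset/bck_offset weights, removes round_offset, and scales back to 8 bits. The helper internals are not shown in this file, so the scalar sketch below follows the general AV1 compound-average scheme from memory; treat the exact constants (DIST_PRECISION_BITS, round_bits) as assumptions rather than a definitive reference:

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4  // fwd_offset + bck_offset sum to 16 in libaom

    static uint8_t clip_pixel_u8(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // 'ref' is the value loaded from conv_params->dst, 'res' the new convolution
    // result, both still carrying the compound offset. 'round_bits' stands for
    // 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS.
    static uint8_t dist_wtd_avg_scalar(uint16_t ref, uint16_t res,
                                       uint16_t fwd_offset, uint16_t bck_offset,
                                       int32_t round_offset, int round_bits) {
      int32_t tmp = ((int32_t)ref * fwd_offset + (int32_t)res * bck_offset) >>
                    DIST_PRECISION_BITS;
      tmp -= round_offset;
      return clip_pixel_u8((tmp + (1 << (round_bits - 1))) >> round_bits);
    }

The basic-average variants that follow do the same thing with a plain (ref + res) >> 1 in place of the weighted sum.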
-
-static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon(
-    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
-    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t dd0, d0;
-    uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d23_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
-    src_ptr += 5 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-      d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-      d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-      d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                            round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-      dst8_ptr += 4 * dst8_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s5 = vld1_s16(src_ptr);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-      dd0 = vld1_u16(dst_ptr);
-
-      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
-
-      store_u8_4x1(dst8_ptr, d01_u8, 0);
-      dst8_ptr += dst8_stride;
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t dd0, d0;
-    uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8;
-    uint16x8_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-        d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-        d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-        d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vld1q_s16(s);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-        dd0 = vld1q_u16(d);
-
-        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
-    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
-    const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t d0;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
-    src_ptr += 5 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-      d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-      d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-      d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s5 = vld1_s16(src_ptr);
-
-      d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-      vst1_u16(dst_ptr, d0);
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t d0;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8;
-    uint16x8_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-        d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
-        d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
-        d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vld1q_s16(s);
-
-        d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
-
-        vst1q_u16(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-static INLINE uint16x4_t
-convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                 const int16x4_t s6, const int16x4_t s7,
-                 const int16x8_t y_filter, const int32x4_t offset_const) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int32x4_t sum = offset_const;
-  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
-  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
-  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
-  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
-  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
-  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
-  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
-  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
-
-  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t
-convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                 const int16x8_t s6, const int16x8_t s7,
-                 const int16x8_t y_filter, const int32x4_t offset_const) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int32x4_t sum0 = offset_const;
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
-
-  int32x4_t sum1 = offset_const;
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
-
-  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
-                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
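The vertical kernels in this file keep a sliding window of source rows in registers: on AArch64 each iteration emits four output rows, then shifts the window down by four (s0 = s4, ..., s6 = s10) so only four new rows are loaded per step. A compact scalar illustration of the same pattern for one column of an 8-tap filter, emitting raw sums only; the function name and the assumption that h is a multiple of 4 are illustrative:

    #include <stdint.h>

    // Sliding-window vertical filtering: hold the last 7 rows for one column,
    // load 4 new rows, emit 4 outputs, then slide the window by 4.
    static void vert_8tap_column(const int16_t *src, int src_stride,
                                 int32_t *dst, int dst_stride,
                                 const int16_t *filter, int h) {
      int16_t win[11];  // s0..s10 for one column
      for (int i = 0; i < 7; ++i) win[i] = src[i * src_stride];
      src += 7 * src_stride;
      do {
        for (int i = 0; i < 4; ++i) win[7 + i] = src[i * src_stride];
        for (int r = 0; r < 4; ++r) {
          int32_t sum = 0;
          for (int k = 0; k < 8; ++k) sum += (int32_t)filter[k] * win[r + k];
          dst[r * dst_stride] = sum;  // offset/rounding omitted for clarity
        }
        for (int i = 0; i < 7; ++i) win[i] = win[i + 4];  // s0 = s4, ..., s6 = s10
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        h -= 4;
      } while (h > 0);
    }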
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
-    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
-    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t dd0, d0;
-    uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d23_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src_ptr += 7 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-      d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                            offset_const);
-      d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                            offset_const);
-      d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                            offset_const);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-      dst8_ptr += 4 * dst8_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s7 = vld1_s16(src_ptr);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-
-      dd0 = vld1_u16(dst_ptr);
-
-      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
-                               vget_low_s16(round_offset_vec), &d01_u8);
-
-      store_u8_4x1(dst8_ptr, d01_u8, 0);
-      dst8_ptr += dst8_stride;
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t dd0, d0;
-    uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10;
-    uint16x8_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-        d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                              offset_const);
-        d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                              offset_const);
-        d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                              offset_const);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vld1q_s16(s);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-
-        dd0 = vld1q_u16(d);
-
-        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
-                                 round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon(
-    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
-    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t dd0, d0;
-    uint8x8_t d01_u8;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d23_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src_ptr += 7 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-      d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                            offset_const);
-      d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                            offset_const);
-      d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                            offset_const);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                            round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-      dst8_ptr += 4 * dst8_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s7 = vld1_s16(src_ptr);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-
-      dd0 = vld1_u16(dst_ptr);
-
-      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
-
-      store_u8_4x1(dst8_ptr, d01_u8, 0);
-      dst8_ptr += dst8_stride;
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t dd0, d0;
-    uint8x8_t d0_u8;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10;
-    uint16x8_t dd1, dd2, dd3, d1, d2, d3;
-    uint8x8_t d1_u8, d2_u8, d3_u8;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-        d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                              offset_const);
-        d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                              offset_const);
-        d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                              offset_const);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vld1q_s16(s);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-
-        dd0 = vld1q_u16(d);
-
-        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
-    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
-    const int16x8_t y_filter, int h, int w) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t d0;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src_ptr += 7 * src_stride;
-
-    do {
-#if AOM_ARCH_AARCH64
-      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-      d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                            offset_const);
-      d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                            offset_const);
-      d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                            offset_const);
-
-      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      h -= 4;
-#else   // !AOM_ARCH_AARCH64
-      s7 = vld1_s16(src_ptr);
-
-      d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                            offset_const);
-
-      vst1_u16(dst_ptr, d0);
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      h--;
-#endif  // AOM_ARCH_AARCH64
-    } while (h != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10;
-    uint16x8_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      int16_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-        d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                              offset_const);
-        d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                              offset_const);
-        d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                              offset_const);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vld1q_s16(s);
-
-        d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                              offset_const);
-
-        vst1q_u16(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
-                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
-  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
-  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
-
-  const int im_h = h + filter_params_y->taps - 1;
-  const int im_stride = MAX_SB_SIZE;
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
-
-  dist_wtd_convolve_2d_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
-                                       x_filter, im_h, w);
-
-  if (clamped_y_taps == 6) {
-    if (conv_params->do_average) {
-      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
-            im_block + im_stride, im_stride, dst8, dst8_stride, conv_params,
-            y_filter, h, w);
-      } else {
-        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block + im_stride, im_stride,
-                                                dst8, dst8_stride, conv_params,
-                                                y_filter, h, w);
-      }
-    } else {
-      dist_wtd_convolve_2d_vert_6tap_neon(im_block + im_stride, im_stride,
-                                          conv_params, y_filter, h, w);
-    }
-  } else {
-    if (conv_params->do_average) {
-      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
-            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
-            w);
-      } else {
-        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
-                                                dst8_stride, conv_params,
-                                                y_filter, h, w);
-      }
-    } else {
-      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
-                                          y_filter, h, w);
-    }
-  }
-}
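The driver above halves the x filter (vshrq_n_s16(..., 1)) because AV1's interpolation taps are all even: halving the taps keeps the horizontal intermediate sums one bit smaller, and the pass compensates by shifting one bit less afterwards. A small self-checking scalar sketch of that identity; the taps below are illustrative values, not an actual AV1 filter:

    #include <assert.h>
    #include <stdint.h>

    // With all taps even, conv(src, taps) == 2 * conv(src, taps >> 1), so a
    // rounding shift by round0 on the full sum equals a rounding shift by
    // (round0 - 1) on the halved sum.
    static void halved_filter_demo(void) {
      const int16_t taps[8] = { -2, 6, -12, 124, 18, -8, 2, 0 };  // even values
      const int16_t src[8] = { 7, 9, 11, 13, 15, 17, 19, 21 };
      const int round0 = 3;  // stand-in for ROUND0_BITS
      int32_t full = 0, half = 0;
      for (int k = 0; k < 8; ++k) {
        full += (int32_t)taps[k] * src[k];
        half += (int32_t)(taps[k] >> 1) * src[k];
      }
      assert(((full + (1 << (round0 - 1))) >> round0) ==
             ((half + (1 << (round0 - 2))) >> (round0 - 1)));
    }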
-
-static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
-  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    uint8x8_t s0, s1, s2, s3, d01, d23;
-    uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
-    do {
-      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
-      d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
-      d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
-      d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
-      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_dist_wtd_avg_4x4(
-          dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
-          vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
-
-      store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
-      store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
-      store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
-      store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      dst8 += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    uint8x8_t s0, s1, s2, s3, d0_u8, d1_u8, d2_u8, d3_u8;
-    uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
-    do {
-      const uint8_t *s = src;
-      CONV_BUF_TYPE *d = dst;
-      uint8_t *d_u8 = dst8;
-      int width = w;
-
-      do {
-        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
-        d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
-        d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
-        d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset,
-                                 vreinterpretq_s16_u16(round_offset_vec),
-                                 &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      dst8 += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_2d_copy_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
-  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
-
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    uint8x8_t s0, s1, s2, s3, d01, d23;
-    uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
-    do {
-      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
-      d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
-      d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
-      d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
-      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                            vreinterpretq_s16_u16(round_offset_vec), &d01,
-                            &d23);
-
-      store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0);
-      store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1);
-      store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0);
-      store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      dst8 += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    uint8x8_t s0, s1, s2, s3, d0_u8, d1_u8, d2_u8, d3_u8;
-    uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-
-    do {
-      const uint8_t *s = src;
-      CONV_BUF_TYPE *d = dst;
-      uint8_t *d_u8 = dst8;
-      int width = w;
-
-      do {
-        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
-        d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
-        d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
-        d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
-                              &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      dst8 += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
-                                                  int src_stride, int w, int h,
-                                                  ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
-  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
-
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    uint8x8_t s0, s1, s2, s3;
-    uint16x4_t d0, d1, d2, d3;
-
-    do {
-      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
-      d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
-      d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
-      d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
-
-      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    uint8x8_t s0, s1, s2, s3;
-    uint16x8_t d0, d1, d2, d3;
-
-    do {
-      const uint8_t *s = src;
-      CONV_BUF_TYPE *d = dst;
-      int width = w;
-
-      do {
-        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
-        d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
-        d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
-        d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
-                                        uint8_t *dst8, int dst8_stride, int w,
-                                        int h, ConvolveParams *conv_params) {
-  if (conv_params->do_average) {
-    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-      dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
-          src, src_stride, dst8, dst8_stride, w, h, conv_params);
-    } else {
-      dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
-                                         h, conv_params);
-    }
-  } else {
-    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
-  }
-}
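The copy kernels above produce the compound intermediate without any filtering: each 8-bit source sample is scaled by 1 << (FILTER_BITS - ROUND0_BITS) and biased by round_offset, which is exactly what the vmlal_u8 with shift_by_bits computes. A scalar sketch of one sample on the non-averaging path; the function name is illustrative:

    #include <stdint.h>

    // One sample of the 2D-copy compound path (no averaging): lift the 8-bit
    // source into the compound domain and add the offset that the averaging
    // step will later subtract.
    static uint16_t copy_compound_sample(uint8_t src, int filter_bits,
                                         int round0_bits,
                                         uint16_t round_offset) {
      return (uint16_t)(round_offset +
                        ((uint16_t)src << (filter_bits - round0_bits)));
    }

With bd = 8 the result stays well inside the uint16 CONV_BUF_TYPE range, so no saturation is needed on this path.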
-
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
-                                       const int8x8_t x_filter,
-                                       const uint8x16x2_t permute_tbl,
-                                       const int32x4_t round_offset) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  // First 4 output values.
-  sum = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
-}
-
-static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
-                                       const int8x8_t x_filter,
-                                       const uint8x16x3_t permute_tbl,
-                                       const int32x4_t round_offset) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum[2];
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  // First 4 output values.
-  sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
-  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
-  // Second 4 output values.
-  sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
-  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
-  // Narrow and re-pack.
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-  return vreinterpretq_u16_s16(res);
-}
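The permute tables above regroup the 16 loaded source bytes into overlapping 4-byte blocks so that each lane of vusdotq_lane_s32 contributes a 4-tap partial dot product; two accumulations per output (filter lanes 0 and 1) give the full 8-tap result. A scalar sketch of that decomposition for a single output pixel; the function name is illustrative:

    #include <stdint.h>

    // 8-tap dot product split into two 4-element dot products, mirroring how
    // the kernels accumulate permuted_samples[0]/[1] against filter lanes 0/1.
    static int32_t dot8_as_two_dot4(const uint8_t *src, const int8_t *filter,
                                    int32_t acc /* round offset / shim */) {
      for (int half = 0; half < 2; ++half) {
        int32_t dot4 = 0;
        for (int k = 0; k < 4; ++k) {
          dot4 += (int32_t)src[4 * half + k] * filter[4 * half + k];
        }
        acc += dot4;
      }
      return acc;
    }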
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // (The extra -1 is needed because we halved the filter values.)
-  const int32x4_t round_offset_shim = vdupq_n_s32(
-      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-      uint8x8_t d01_u8, d23_u8;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
-      d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
-      d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
-      d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
-        d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
-        d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
-        d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // (The extra -1 is needed because we halved the filter values.)
-  const int32x4_t round_offset_shim = vdupq_n_s32(
-      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-      uint8x8_t d01_u8, d23_u8;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
-      d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
-      d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
-      d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                            round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
-        d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
-        d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
-        d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_neon(
-    const uint8_t *src, int src_stride, int w, int h,
-    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
-  // shifts - which are generally faster than rounding shifts on modern CPUs.
-  // (The extra -1 is needed because we halved the filter values.)
-  const int32x4_t round_offset_shim = vdupq_n_s32(
-      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, permute_tbl, round_offset_shim);
-      d1 = convolve8_4_x(s1, x_filter, permute_tbl, round_offset_shim);
-      d2 = convolve8_4_x(s2, x_filter, permute_tbl, round_offset_shim);
-      d3 = convolve8_4_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
-        d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
-        d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
-        d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-#elif AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE uint16x4_t convolve8_4_x(uint8x16_t samples,
-                                       const int8x8_t x_filter,
-                                       const int32x4_t correction,
-                                       const uint8x16_t range_limit,
-                                       const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  // Accumulate dot product into 'correction' to account for range clamp.
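-  // Since sum(f[k] * (x[k] - 128)) == sum(f[k] * x[k]) - 128 * sum(f[k]), the
-  // precomputed 'correction' (which folds in 128 * sum(f[k])) restores the
-  // unsigned-sample result.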
-  sum = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], x_filter, 1);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
-}
-
-static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
-                                       const int8x8_t x_filter,
-                                       const int32x4_t correction,
-                                       const uint8x16_t range_limit,
-                                       const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum[2];
-
-  // Clamp sample range to [-128, 127] for 8-bit signed dot product.
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  // Permute samples ready for dot product.
-  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  // Accumulate dot product into 'correction' to account for range clamp.
-  // First 4 output values.
-  sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
-  sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
-  // Second 4 output values.
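-  // (Outputs 4-7 use the same window pattern shifted by four samples, so
-  // permuted_samples[1] doubles as their low half.)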
-  sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
-  sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
-
-  // Narrow and re-pack.
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
-                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
-  return vreinterpretq_u16_s16(res);
-}
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  // Dot-product constants and other shims.
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
-  // Fold round_offset into the dot-product filter correction constant. The
-  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
-  // rounding shifts - which are generally faster than rounding shifts on
-  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
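-  // Since the halved taps always sum to 64, correction_s32 below is
-  // 128 * 64 == 8192; the remaining terms fold in the shifted round_offset
-  // and the rounding bit for the non-rounding narrowing shift.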
-  int32x4_t correction =
-      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
-                  (1 << ((ROUND0_BITS - 1) - 1)));
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-      uint8x8_t d01_u8, d23_u8;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
-        d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
-        d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
-        d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  // Dot-product constants and other shims.
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
-  // Fold round_offset into the dot-product filter correction constant. The
-  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
-  // rounding shifts - which are generally faster than rounding shifts on
-  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
-  int32x4_t correction =
-      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
-                  (1 << ((ROUND0_BITS - 1) - 1)));
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-      uint8x8_t d01_u8, d23_u8;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                            round_offset_vec, &d01_u8, &d23_u8);
-
-      store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0);
-      store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1);
-      store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0);
-      store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
-        d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
-        d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
-        d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_neon(
-    const uint8_t *src, int src_stride, int w, int h,
-    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  // Dot-product constants and other shims.
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  const int32_t correction_s32 = vaddlvq_s16(vshll_n_s8(x_filter, 7));
-  // Fold round_offset into the dot-product filter correction constant. The
-  // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
-  // rounding shifts - which are generally faster than rounding shifts on
-  // modern CPUs. (The extra -1 is needed because we halved the filter values.)
-  int32x4_t correction =
-      vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
-                  (1 << ((ROUND0_BITS - 1) - 1)));
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  int height = h;
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-
-    do {
-      uint8x16_t s0, s1, s2, s3;
-      uint16x4_t d0, d1, d2, d3;
-
-      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_x(s0, x_filter, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_x(s1, x_filter, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_x(s2, x_filter, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int width = w;
-
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        uint16x8_t d0, d1, d2, d3;
-
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
-        d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
-        d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
-        d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
-static INLINE uint16x4_t convolve8_4_x(const int16x4_t s0, const int16x4_t s1,
-                                       const int16x4_t s2, const int16x4_t s3,
-                                       const int16x4_t s4, const int16x4_t s5,
-                                       const int16x4_t s6, const int16x4_t s7,
-                                       const int16x8_t x_filter,
-                                       const int16x4_t round_offset) {
-  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
-  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
-  int16x4_t sum = vmul_lane_s16(s0, x_filter_0_3, 0);
-  sum = vmla_lane_s16(sum, s1, x_filter_0_3, 1);
-  sum = vmla_lane_s16(sum, s2, x_filter_0_3, 2);
-  sum = vmla_lane_s16(sum, s3, x_filter_0_3, 3);
-  sum = vmla_lane_s16(sum, s4, x_filter_4_7, 0);
-  sum = vmla_lane_s16(sum, s5, x_filter_4_7, 1);
-  sum = vmla_lane_s16(sum, s6, x_filter_4_7, 2);
-  sum = vmla_lane_s16(sum, s7, x_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
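-  // vrsra_n_s16 also adds round_offset in the same instruction, so the
-  // rounding shift and the compound offset are applied in one step.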
-  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
-                                       const int16x8_t s2, const int16x8_t s3,
-                                       const int16x8_t s4, const int16x8_t s5,
-                                       const int16x8_t s6, const int16x8_t s7,
-                                       const int16x8_t x_filter,
-                                       const int16x8_t round_offset) {
-  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
-  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
-
-  int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
-  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
-  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
-  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
-  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
-  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
-  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
-  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpretq_u16_s16(res);
-}
-
-static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  const uint8_t *s;
-  uint8_t *d_u8;
-  CONV_BUF_TYPE *d;
-  int width;
-  int height = h;
-
-  uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x4_t d0, dd0;
-    uint8x8_t d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s9, s10;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      d = dst_ptr;
-      d_u8 = dst8_ptr;
-      width = w;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
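-      // The horizontal filter is applied down the transposed columns so that
-      // four rows share each sliding 8-tap window; the results are transposed
-      // back before the compound average and store.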
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
-        transpose_u8_4x4(&t0, &t1);
-
-        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-
-        __builtin_prefetch(d_u8 + 0 * dst8_stride);
-        __builtin_prefetch(d_u8 + 1 * dst8_stride);
-        __builtin_prefetch(d_u8 + 2 * dst8_stride);
-        __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4;
-        d += 4;
-        d_u8 += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-#else   // !AOM_ARCH_AARCH64
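-      // This path processes a single row per iteration and builds the sliding
-      // 8-tap windows with vext_s16 rather than by transposing a block of
-      // rows.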
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-      __builtin_prefetch(d);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-        s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-        s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-        s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-        s5 = vext_s16(s4, s8, 1);  // a5 a6 a7 a8
-        s6 = vext_s16(s4, s8, 2);  // a6 a7 a8 a9
-        s7 = vext_s16(s4, s8, 3);  // a7 a8 a9 a10
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d);
-        __builtin_prefetch(d_u8);
-
-        dd0 = vld1_u16(d);
-
-        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
-                                 vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s4;
-        s4 = s8;
-        s += 4;
-        d += 4;
-        d_u8 += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      dst8_ptr += dst8_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8;
-
-    do {
-      d = dst_ptr;
-      d_u8 = dst8_ptr;
-      width = w;
-
-#if AOM_ARCH_AARCH64
-      int16x8_t s9, s10, s11, s12, s13, s14;
-      uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-      uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
-                           round_offset_vec);
-
-        transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
-                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
-                                 &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
-                     d7_u8);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * dst_stride;
-      dst8_ptr += 8 * dst8_stride;
-      height -= 8;
-#else   // !AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
-        s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-
-        dd0 = vld1q_u16(d);
-
-        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
-                                 round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-
-        s0 = s8;
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      dst8_ptr += dst8_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_avg_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  uint8_t *dst8_ptr = dst8;
-  int dst_stride = conv_params->dst_stride;
-  const uint8_t *s;
-  uint8_t *d_u8;
-  CONV_BUF_TYPE *d;
-  int width;
-  int height = h;
-
-  uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x4_t d0, dd0;
-    uint8x8_t d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s9, s10;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      d = dst_ptr;
-      d_u8 = dst8_ptr;
-      width = w;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
-        transpose_u8_4x4(&t0, &t1);
-
-        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-
-        __builtin_prefetch(d_u8 + 0 * dst8_stride);
-        __builtin_prefetch(d_u8 + 1 * dst8_stride);
-        __builtin_prefetch(d_u8 + 2 * dst8_stride);
-        __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4;
-        d += 4;
-        d_u8 += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      dst8_ptr += 4 * dst8_stride;
-      height -= 4;
-#else   // !AOM_ARCH_AARCH64
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-      __builtin_prefetch(d);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-        s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-        s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-        s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-        s5 = vext_s16(s4, s8, 1);  // a5 a6 a7 a8
-        s6 = vext_s16(s4, s8, 2);  // a6 a7 a8 a9
-        s7 = vext_s16(s4, s8, 3);  // a7 a8 a9 a10
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d);
-        __builtin_prefetch(d_u8);
-
-        dd0 = vld1_u16(d);
-
-        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s4;
-        s4 = s8;
-        s += 4;
-        d += 4;
-        d_u8 += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      dst8_ptr += dst8_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8;
-
-    do {
-      d = dst_ptr;
-      d_u8 = dst8_ptr;
-      width = w;
-
-#if AOM_ARCH_AARCH64
-      int16x8_t s9, s10, s11, s12, s13, s14;
-      uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-      uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
-                           round_offset_vec);
-
-        transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
-                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
-                     d7_u8);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * dst_stride;
-      dst8_ptr += 8 * dst8_stride;
-      height -= 8;
-#else   // !AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
-        s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-
-        dd0 = vld1q_u16(d);
-
-        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-
-        s0 = s8;
-        s += 8;
-        d += 8;
-        d_u8 += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      dst8_ptr += dst8_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_x_neon(
-    const uint8_t *src, int src_stride, int w, int h,
-    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
-    ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  // Horizontal filter.
-  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
-
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - horiz_offset;
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const uint8_t *s;
-  CONV_BUF_TYPE *d;
-  int width;
-  int height = h;
-
-  uint8x8_t t0;
-#if AOM_ARCH_AARCH64
-  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x4_t d0;
-#if AOM_ARCH_AARCH64
-    int16x4_t s9, s10;
-    uint16x4_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      d = dst_ptr;
-      width = w;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-#if AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
-        transpose_u8_4x4(&t0, &t1);
-
-        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s9 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s10 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        transpose_u16_4x4d(&d0, &d1, &d2, &d3);
-
-        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4;
-        d += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-#else   // !AOM_ARCH_AARCH64
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-      __builtin_prefetch(d);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
-        s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
-        s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
-        s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
-        s5 = vext_s16(s4, s8, 1);  // a5 a6 a7 a8
-        s6 = vext_s16(s4, s8, 2);  // a6 a7 a8 a9
-        s7 = vext_s16(s4, s8, 3);  // a7 a8 a9 a10
-
-        d0 = convolve8_4_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           vget_low_s16(round_offset_vec));
-
-        vst1_u16(d, d0);
-
-        s0 = s4;
-        s4 = s8;
-        s += 4;
-        d += 4;
-        width -= 4;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-    uint16x8_t d0;
-
-    do {
-      d = dst_ptr;
-      width = w;
-
-#if AOM_ARCH_AARCH64
-      int16x8_t s9, s10, s11, s12, s13, s14;
-      uint16x8_t d1, d2, d3, d4, d5, d6, d7;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      s = src_ptr + 7;
-
-      do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
-                           round_offset_vec);
-
-        transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
-        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * dst_stride;
-      height -= 8;
-#else   // !AOM_ARCH_AARCH64
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      s = src_ptr + 8;
-
-      do {
-        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
-
-        s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                           round_offset_vec);
-
-        vst1q_u16(d, d0);
-
-        s0 = s8;
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      height--;
-#endif  // AOM_ARCH_AARCH64
-    } while (height != 0);
-  }
-}
-
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
-                                  uint8_t *dst8, int dst8_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const int subpel_x_qn,
-                                  ConvolveParams *conv_params) {
-  if (conv_params->do_average) {
-    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-      dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
-                                            w, h, filter_params_x, subpel_x_qn,
-                                            conv_params);
-    } else {
-      dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
-                                   filter_params_x, subpel_x_qn, conv_params);
-    }
-  } else {
-    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
-                             subpel_x_qn, conv_params);
-  }
-}
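
For reference, a scalar sketch of the blend that the compute_dist_wtd_avg_* helpers used throughout this file perform on the stored prediction and the new one. It assumes the usual 8-bit constants (DIST_PRECISION_BITS == 4, FILTER_BITS == 7, ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7), which are not restated in this diff, and the helper name is hypothetical:

    #include <stdint.h>

    /* Illustrative sketch only; dist_wtd_avg_px is a hypothetical name and the
     * constants below (DIST_PRECISION_BITS = 4, FILTER_BITS = 7,
     * ROUND0_BITS = 3, COMPOUND_ROUND1_BITS = 7) are assumed. */
    static uint8_t dist_wtd_avg_px(uint16_t ref, uint16_t res,
                                   uint16_t fwd_offset, uint16_t bck_offset,
                                   int16_t round_offset) {
      /* Distance-weighted blend of the stored prediction and the new one. */
      int32_t blend = ((int32_t)ref * fwd_offset + (int32_t)res * bck_offset) >> 4;
      /* Remove the compound offset, then apply the final rounding shift. */
      const int round_bits = 2 * 7 - 3 - 7; /* = 4 */
      int32_t px = (blend - round_offset + (1 << (round_bits - 1))) >> round_bits;
      return (uint8_t)(px < 0 ? 0 : (px > 255 ? 255 : px));
    }
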
-
-static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
-                                       const int16x4_t s2, const int16x4_t s3,
-                                       const int16x4_t s4, const int16x4_t s5,
-                                       const int16x8_t y_filter,
-                                       const int16x4_t round_offset) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  // Filter values at indices 0 and 7 are 0.
-  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
-  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
-  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
-  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
-  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
-  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
-                                       const int16x8_t s2, const int16x8_t s3,
-                                       const int16x8_t s4, const int16x8_t s5,
-                                       const int16x8_t y_filter,
-                                       const int16x8_t round_offset) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  // Filter values at indices 0 and 7 are 0.
-  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
-  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
-  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
-  sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
-  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
-  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpretq_u16_s16(res);
-}
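
In scalar terms, the convolve6_*_y helpers above compute something like the sketch below. The taps were pre-halved (they are all even), so the rounding shift is ROUND0_BITS - 1 rather than ROUND0_BITS; ROUND0_BITS == 3 is assumed here, and half_filter holds lanes 1..6 of the 8-tap kernel (lanes 0 and 7 are zero):

    #include <stdint.h>

    /* Illustrative scalar equivalent; assumes ROUND0_BITS == 3. */
    static int16_t convolve6_y_scalar(const int16_t s[6],
                                      const int16_t half_filter[6],
                                      int16_t round_offset) {
      int32_t sum = 0;
      for (int k = 0; k < 6; ++k) sum += (int32_t)s[k] * half_filter[k];
      /* Rounding shift by ROUND0_BITS - 1 compensates for the halved taps,
       * then the compound offset is added (mirrors the vrsra_n_s16 above). */
      const int shift = 3 - 1;
      return (int16_t)(round_offset + ((sum + (1 << (shift - 1))) >> shift));
    }
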
-
-static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
-    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
-    const int dst8_stride, int w, int h, const int16x8_t y_filter,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t d0, dd0;
-    uint8x8_t t0, t1, t2, t3, t4, d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        d_u8 += 4 * dst8_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        dd0 = vld1_u16(d);
-
-        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
-                                 vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        d_u8 += dst8_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      dst8_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8, t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8, s9, s10, s11, s12;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-    uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr + (5 * src_stride);
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-        d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
-        d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
-        d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
-        d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
-        d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
-        d6 =
-            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
-        d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
-                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
-                                 &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-
-        dd0 = vld1q_u16(d);
-
-        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
-                                 round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
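
The offset_bits / round_offset computation at the top of this helper recurs in every compound path in this file. Under the usual 8-bit constants (FILTER_BITS == 7, ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7 -- assumed, not restated in the diff) it evaluates to the constant that the averaging stage later subtracts back out:

    #include <assert.h>

    /* Quick arithmetic check; constants are assumed as noted above. */
    static void check_round_offset(void) {
      const int bd = 8;
      const int offset_bits = bd + 2 * 7 - 3;                /* 19 */
      const int round_offset = (1 << (offset_bits - 7)) +    /* 4096 */
                               (1 << (offset_bits - 7 - 1)); /* + 2048 */
      assert(round_offset == 6144);
    }
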
-
-static INLINE void dist_wtd_convolve_y_6tap_avg_neon(
-    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
-    const int dst8_stride, int w, int h, const int16x8_t y_filter,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t d0, dd0;
-    uint8x8_t t0, t1, t2, t3, t4, d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        d_u8 += 4 * dst8_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        dd0 = vld1_u16(d);
-
-        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        d_u8 += dst8_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      dst8_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8, t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8, s9, s10, s11, s12;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-    uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr + (5 * src_stride);
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-        d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
-        d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
-        d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
-        d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
-        d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
-        d6 =
-            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
-        d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
-                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-
-        dd0 = vld1q_u16(d);
-
-        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
-                                                 int src_stride, int w, int h,
-                                                 const int16x8_t y_filter,
-                                                 ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5;
-    uint16x4_t d0;
-    uint8x8_t t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
-    int16x4_t s6, s7, s8;
-    uint16x4_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-
-      s += 5 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        vst1_u16(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5;
-    uint16x8_t d0;
-    uint8x8_t t0, t1, t2, t3, t4;
-#if AOM_ARCH_AARCH64
-    int16x8_t s6, s7, s8, s9, s10, s11, s12;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7;
-    uint8x8_t t5, t6, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr + (5 * src_stride);
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-        d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
-        d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
-        d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
-        d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
-        d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
-        d6 =
-            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
-        d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-
-        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-
-        vst1q_u16(d, d0);
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
-
-static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
-                                       const int16x4_t s2, const int16x4_t s3,
-                                       const int16x4_t s4, const int16x4_t s5,
-                                       const int16x4_t s6, const int16x4_t s7,
-                                       const int16x8_t y_filter,
-                                       const int16x4_t round_offset) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
-  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
-  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
-  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
-  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
-  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
-  sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
-  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpret_u16_s16(res);
-}
-
-static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
-                                       const int16x8_t s2, const int16x8_t s3,
-                                       const int16x8_t s4, const int16x8_t s5,
-                                       const int16x8_t s6, const int16x8_t s7,
-                                       const int16x8_t y_filter,
-                                       const int16x8_t round_offset) {
-  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
-  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
-
-  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
-  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
-  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
-  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
-  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
-  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
-  sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
-  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
-
-  // We halved the convolution filter values so -1 from the right shift.
-  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
-  return vreinterpretq_u16_s16(res);
-}
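
convolve8_4_y and convolve8_8_y above use all eight filter lanes, while the earlier convolve6_* helpers skip lanes 0 and 7. The shortcut rests on a simple identity, sketched here in scalar form:

    #include <stdint.h>

    /* When f[0] == 0 and f[7] == 0, only lanes 1..6 contribute, so the 8-tap
     * dot product collapses to the 6-tap one used by the convolve6_* helpers. */
    static int32_t dot8(const int16_t s[8], const int16_t f[8]) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += (int32_t)s[k] * f[k];
      return sum;
    }

    static int32_t dot6(const int16_t s[8], const int16_t f[8]) {
      int32_t sum = 0;
      for (int k = 1; k < 7; ++k) sum += (int32_t)s[k] * f[k];
      return sum; /* equals dot8(s, f) whenever f[0] == 0 && f[7] == 0 */
    }
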
-
-static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
-    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
-    const int dst8_stride, int w, int h, const int16x8_t y_filter,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  const uint16_t fwd_offset = conv_params->fwd_offset;
-  const uint16_t bck_offset = conv_params->bck_offset;
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t d0, dd0;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-      t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
-      t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-      s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
-      s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-
-        __builtin_prefetch(d_u8 + 0 * dst8_stride);
-        __builtin_prefetch(d_u8 + 1 * dst8_stride);
-        __builtin_prefetch(d_u8 + 2 * dst8_stride);
-        __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        d_u8 += 4 * dst8_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d);
-
-        dd0 = vld1_u16(d);
-
-        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
-                                 vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        d_u8 += dst8_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      dst8_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8, t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10, s11, s12, s13, s14;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-    uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-      __builtin_prefetch(s + 4 * src_stride);
-      __builtin_prefetch(s + 5 * src_stride);
-      __builtin_prefetch(s + 6 * src_stride);
-      __builtin_prefetch(s + 7 * src_stride);
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        __builtin_prefetch(dst_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_ptr + 1 * dst_stride);
-        __builtin_prefetch(dst_ptr + 2 * dst_stride);
-        __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
-                           round_offset_vec);
-
-        __builtin_prefetch(d + 0 * dst8_stride);
-        __builtin_prefetch(d + 1 * dst8_stride);
-        __builtin_prefetch(d + 2 * dst8_stride);
-        __builtin_prefetch(d + 3 * dst8_stride);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
-                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
-                                 &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
-                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
-                                 &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        __builtin_prefetch(dst_ptr);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-
-        __builtin_prefetch(d);
-
-        dd0 = vld1q_u16(d);
-
-        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
-                                 round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_y_8tap_avg_neon(
-    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
-    const int dst8_stride, int w, int h, const int16x8_t y_filter,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t d0, dd0;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t d1, d2, d3, dd1, dd2, dd3;
-    uint8x8_t d23;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-      t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
-      t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-      s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
-      s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-
-        __builtin_prefetch(d_u8 + 0 * dst8_stride);
-        __builtin_prefetch(d_u8 + 1 * dst8_stride);
-        __builtin_prefetch(d_u8 + 2 * dst8_stride);
-        __builtin_prefetch(d_u8 + 3 * dst8_stride);
-
-        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d01, &d23);
-
-        store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
-        store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
-        store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
-        store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        d_u8 += 4 * dst8_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        __builtin_prefetch(d);
-
-        dd0 = vld1_u16(d);
-
-        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
-
-        store_u8_4x1(d_u8, d01, 0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        d_u8 += dst8_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      dst8_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0, dd0;
-    uint8x8_t d0_u8, t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10, s11, s12, s13, s14;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-    uint8x8_t d1_u8, d2_u8, d3_u8, d4_u8, d5_u8, d6_u8, d7_u8, t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      uint8_t *d_u8 = dst8_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-      __builtin_prefetch(s + 4 * src_stride);
-      __builtin_prefetch(s + 5 * src_stride);
-      __builtin_prefetch(s + 6 * src_stride);
-      __builtin_prefetch(s + 7 * src_stride);
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        __builtin_prefetch(dst_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_ptr + 1 * dst_stride);
-        __builtin_prefetch(dst_ptr + 2 * dst_stride);
-        __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
-                           round_offset_vec);
-
-        __builtin_prefetch(d + 0 * dst8_stride);
-        __builtin_prefetch(d + 1 * dst8_stride);
-        __builtin_prefetch(d + 2 * dst8_stride);
-        __builtin_prefetch(d + 3 * dst8_stride);
-
-        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
-
-        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
-                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
-        d_u8 += 4 * dst8_stride;
-
-        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
-
-        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
-                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
-
-        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
-        d_u8 += 4 * dst8_stride;
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        __builtin_prefetch(dst_ptr);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-
-        __builtin_prefetch(d);
-
-        dd0 = vld1q_u16(d);
-
-        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
-
-        vst1_u8(d_u8, d0_u8);
-        d_u8 += dst8_stride;
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      dst8_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
-
-static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
-                                                 int src_stride, int w, int h,
-                                                 const int16x8_t y_filter,
-                                                 ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
-  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
-                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
-  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
-
-  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  int width = w;
-
-  if (w == 4 || h == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x4_t d0;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
-    int16x4_t s8, s9, s10;
-    uint16x4_t d1, d2, d3;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-
-      t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-      t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-      t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-      t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-      t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
-      t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
-      t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
-
-      s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-      s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
-      s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
-
-      __builtin_prefetch(d + 0 * dst_stride);
-      __builtin_prefetch(d + 1 * dst_stride);
-      __builtin_prefetch(d + 2 * dst_stride);
-      __builtin_prefetch(d + 3 * dst_stride);
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
-        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
-        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
-        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
-
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-        s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-        s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-        s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           vget_low_s16(round_offset_vec));
-        d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-#else   // !AOM_ARCH_AARCH64
-        t0 = load_unaligned_u8_4x1(s);
-        s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-
-        d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           vget_low_s16(round_offset_vec));
-
-        vst1_u16(d, d0);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 4;
-      dst_ptr += 4;
-      width -= 4;
-    } while (width != 0);
-  } else {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint16x8_t d0;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-#if AOM_ARCH_AARCH64
-    int16x8_t s8, s9, s10, s11, s12, s13, s14;
-    uint16x8_t d1, d2, d3, d4, d5, d6, d7;
-    uint8x8_t t7;
-#endif  // AOM_ARCH_AARCH64
-
-    do {
-      const uint8_t *s = src_ptr;
-      CONV_BUF_TYPE *d = dst_ptr;
-      int height = h;
-
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-      __builtin_prefetch(s + 4 * src_stride);
-      __builtin_prefetch(s + 5 * src_stride);
-      __builtin_prefetch(s + 6 * src_stride);
-      __builtin_prefetch(s + 7 * src_stride);
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s += 7 * src_stride;
-
-      do {
-#if AOM_ARCH_AARCH64
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        __builtin_prefetch(dst_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_ptr + 1 * dst_stride);
-        __builtin_prefetch(dst_ptr + 2 * dst_stride);
-        __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-        d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                           round_offset_vec);
-        d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                           round_offset_vec);
-        d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                           round_offset_vec);
-        d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
-                           round_offset_vec);
-        d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
-                           round_offset_vec);
-        d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
-                           round_offset_vec);
-        d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
-                           round_offset_vec);
-
-        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
-#else   // !AOM_ARCH_AARCH64
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-
-        __builtin_prefetch(dst_ptr);
-
-        d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                           round_offset_vec);
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-
-        vst1q_u16(d, d0);
-
-        s += src_stride;
-        d += dst_stride;
-        height--;
-#endif  // AOM_ARCH_AARCH64
-      } while (height != 0);
-      src_ptr += 8;
-      dst_ptr += 8;
-      width -= 8;
-    } while (width != 0);
-  }
-}
-
-void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
-                                  uint8_t *dst8, int dst8_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  assert(w % 4 == 0);
-  assert(h % 4 == 0);
-
-  // Vertical filter.
-  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  // Filter values are even, so downshift by 1 to reduce intermediate
-  // precision requirements.
-  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
-
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const uint8_t *src_ptr = src - (vert_offset * src_stride);
-
-  if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
-    if (conv_params->do_average) {
-      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-        dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
-            src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
-            conv_params);
-      } else {
-        dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
-                                          dst8, dst8_stride, w, h, y_filter,
-                                          conv_params);
-      }
-    } else {
-      dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
-                                    y_filter, conv_params);
-    }
-  } else {
-    if (conv_params->do_average) {
-      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
-        dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
-                                                   dst8_stride, w, h, y_filter,
-                                                   conv_params);
-      } else {
-        dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
-                                          dst8_stride, w, h, y_filter,
-                                          conv_params);
-      }
-    } else {
-      dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
-                                    conv_params);
-    }
-  }
-}
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
index 3694763..2b0274c 100644
--- a/av1/common/arm/reconinter_neon.c
+++ b/av1/common/arm/reconinter_neon.c
@@ -12,6 +12,7 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <stdbool.h>
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
@@ -20,6 +21,93 @@
 #include "av1/common/blockd.h"
 #include "config/av1_rtcd.h"
 
+static AOM_INLINE void diffwtd_mask_d16_neon(
+    uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
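+  // For each pixel, compute diff = |src0 - src1|, undo the compound-convolve
+  // rounding, then scale down by DIFF_FACTOR_LOG2. The mask is
+  // clamp(38 + diff, 0, AOM_BLEND_A64_MAX_ALPHA), or 64 minus that when
+  // 'inverse' is set (implemented below with a saturating subtract).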
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round));
+
+  if (w >= 16) {
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint16x8_t s0_lo = vld1q_u16(src0 + j);
+        uint16x8_t s1_lo = vld1q_u16(src1 + j);
+        uint16x8_t s0_hi = vld1q_u16(src0 + j + 8);
+        uint16x8_t s1_hi = vld1q_u16(src1 + j + 8);
+
+        uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec);
+        uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec);
+        uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+        uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+        uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+        uint8x16_t m;
+        if (inverse) {
+          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+        } else {
+          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+        }
+
+        vst1q_u8(mask, m);
+
+        mask += 16;
+        j += 16;
+      } while (j < w);
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 8) {
+    int i = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src0);
+      uint16x8_t s1 = vld1q_u16(src1);
+
+      uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+      uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+      uint8x8_t m;
+      if (inverse) {
+        m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8);  // Saturating to 0
+      } else {
+        m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+      }
+
+      vst1_u8(mask, m);
+
+      mask += 8;
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 4) {
+    int i = 0;
+    do {
+      uint16x8_t s0 =
+          vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride));
+      uint16x8_t s1 =
+          vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride));
+
+      uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+      uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+      uint8x8_t m;
+      if (inverse) {
+        m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8);  // Saturating to 0
+      } else {
+        m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+      }
+
+      vst1_u8(mask, m);
+
+      mask += 8;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      i += 2;
+    } while (i < h);
+  }
+}
+
 void av1_build_compound_diffwtd_mask_d16_neon(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
@@ -27,60 +115,103 @@
   assert(h >= 4);
   assert(w >= 4);
   assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
-  const int round =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
-  uint16x8_t diff_q, tmp0, tmp1;
-  uint8x8_t diff_d, diff_select;
-  const CONV_BUF_TYPE *src0_1, *src1_1;
-  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
-  const uint8x8_t dup_38 = vdup_n_u8(38);
-  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+
   if (mask_type == DIFFWTD_38) {
-    diff_select = vdup_n_u8(255);
-  } else {
-    diff_select = vdup_n_u8(0);
+    diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                          src1_stride, h, w, conv_params, bd);
+  } else {  // mask_type == DIFFWTD_38_INV
+    diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                          src1_stride, h, w, conv_params, bd);
   }
-  if (w >= 8) {
-    for (int i = 0; i < h; ++i) {
-      src0_1 = src0;
-      src1_1 = src1;
-      for (int j = 0; j < w; j += 8) {
-        __builtin_prefetch(src0_1);
-        __builtin_prefetch(src1_1);
-        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
-        diff_q = vrshlq_u16(diff_q, dup_round);
-        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
-        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
-        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
-        vst1_u8(mask, diff_d);
-        src0_1 += 8;
-        src1_1 += 8;
-        mask += 8;
-      }
+}
+
+static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
+                                         const uint8_t *src0, int src0_stride,
+                                         const uint8_t *src1, int src1_stride,
+                                         int h, int w) {
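+  // Same mask as the d16 variant above, but the inputs are already 8-bit
+  // pixels, so only the >> DIFF_FACTOR_LOG2 scaling is needed before
+  // applying the 38/64 clamp (or its inverse).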
+  if (w >= 16) {
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + j);
+        uint8x16_t s1 = vld1q_u8(src1 + j);
+
+        uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+        uint8x16_t m;
+        if (inverse) {
+          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+        } else {
+          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+        }
+
+        vst1q_u8(mask, m);
+
+        mask += 16;
+        j += 16;
+      } while (j < w);
       src0 += src0_stride;
       src1 += src1_stride;
-    }
+    } while (++i < h);
+  } else if (w == 8) {
+    int i = 0;
+    do {
+      uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
+      uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      i += 2;
+    } while (i < h);
   } else if (w == 4) {
-    for (int i = 0; i < h; i += 2) {
-      src0_1 = src0;
-      src1_1 = src1;
-      __builtin_prefetch(src0_1 + 0 * src0_stride);
-      __builtin_prefetch(src0_1 + 1 * src0_stride);
-      __builtin_prefetch(src1_1 + 0 * src1_stride);
-      __builtin_prefetch(src1_1 + 1 * src1_stride);
-      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
-                          vld1_u16(src0_1 + (1 * src0_stride)));
-      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
-                          vld1_u16(src1_1 + (1 * src1_stride)));
-      diff_q = vabdq_u16(tmp0, tmp1);
-      diff_q = vrshlq_u16(diff_q, dup_round);
-      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
-      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
-      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
-      vst1_u8(mask, diff_d);
-      src0 += src0_stride * 2;
-      src1 += src1_stride * 2;
-      mask += w * 2;
-    }
+    int i = 0;
+    do {
+      uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
+      uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 4 * src0_stride;
+      src1 += 4 * src1_stride;
+      i += 4;
+    } while (i < h);
+  }
+}
+
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
+                                          int h, int w) {
+  assert(h % 4 == 0);
+  assert(w % 4 == 0);
+  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                      src1_stride, h, w);
+  } else {  // mask_type == DIFFWTD_38_INV
+    diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                      src1_stride, h, w);
   }
 }
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c
index 8d190fb..cf488a9 100644
--- a/av1/common/arm/reconintra_neon.c
+++ b/av1/common/arm/reconintra_neon.c
@@ -17,6 +17,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/sum_neon.h"
 
+#define MAX_UPSAMPLE_SZ 16
+
 DECLARE_ALIGNED(16, const int8_t,
                 av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = {
   {
@@ -153,3 +155,185 @@
     dst += stride;
   }
 }
+
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
+  if (!strength) return;
+  assert(sz >= 0 && sz <= 129);
+
+  uint8_t edge[160];  // Max value of sz + enough padding for vector accesses.
+  memcpy(edge + 1, p, sz * sizeof(*p));
+
+  // Pad the copy: duplicate the first pixel and replicate the last pixel
+  // twice.
+  edge[0] = edge[1];
+  edge[sz + 1] = edge[sz];
+  edge[sz + 2] = edge[sz];
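+  // The padded local copy provides the neighbouring samples the 3- and 5-tap
+  // kernels need at either end, while the oversized buffer absorbs the full
+  // 8-byte vector loads near the tail.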
+
+  // Don't overwrite first pixel.
+  uint8_t *dst = p + 1;
+  sz--;
+
+  if (strength == 1) {  // Filter: {4, 8, 4}.
+    const uint8_t *src = edge + 1;
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddl_u8(s0, s2);
+      uint16x8_t t1 = vaddl_u8(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t t0 = vaddl_u8(s0, s2);
+      uint16x8_t t1 = vaddl_u8(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  } else if (strength == 2) {  // Filter: {5, 6, 5}.
+    const uint8_t *src = edge + 1;
+
+    const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } };
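+    // Unlike strengths 1 and 3, the {5, 6, 5} taps do not reduce to shifts,
+    // so use widening multiply-accumulates with the taps held in vectors.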
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+      accum = vmlal_u8(accum, s1, filter.val[1]);
+      accum = vmlal_u8(accum, s2, filter.val[2]);
+      uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+      accum = vmlal_u8(accum, s1, filter.val[1]);
+      accum = vmlal_u8(accum, s2, filter.val[2]);
+      uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  } else {  // Filter {2, 4, 4, 4, 2}.
+    const uint8_t *src = edge;
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+      uint8x8_t s3 = vld1_u8(src + 3);
+      uint8x8_t s4 = vld1_u8(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddl_u8(s0, s4);
+      uint16x8_t t1 = vaddl_u8(s1, s2);
+      t1 = vaddw_u8(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+      uint8x8_t s3 = vld1_u8(src + 3);
+      uint8x8_t s4 = vld1_u8(src + 4);
+
+      uint16x8_t t0 = vaddl_u8(s0, s4);
+      uint16x8_t t1 = vaddl_u8(s1, s2);
+      t1 = vaddw_u8(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  }
+}
+
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
+  if (!sz) return;
+
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  uint8_t edge[MAX_UPSAMPLE_SZ + 3];
+  const uint8_t *src = edge;
+
+  // Copy p[-1..(sz-1)] and pad out both ends.
+  edge[0] = p[-1];
+  edge[1] = p[-1];
+  memcpy(edge + 2, p, sz);
+  edge[sz + 2] = p[sz - 1];
+  p[-2] = p[-1];
+
+  uint8_t *dst = p - 1;
+
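+  // Each iteration produces 16 output samples from 8 inputs: vst2 interleaves
+  // the (-1, 9, 9, -1)/16 interpolated values with the original pixels (s2).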
+  do {
+    uint8x8_t s0 = vld1_u8(src);
+    uint8x8_t s1 = vld1_u8(src + 1);
+    uint8x8_t s2 = vld1_u8(src + 2);
+    uint8x8_t s3 = vld1_u8(src + 3);
+
+    int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3));
+    int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2));
+    t1 = vmulq_n_s16(t1, 9);
+    t1 = vsubq_s16(t1, t0);
+
+    uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } };
+
+    vst2_u8(dst, res);
+
+    src += 8;
+    dst += 16;
+    sz -= 8;
+  } while (sz > 0);
+}
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index 5f6d214..076981b 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -15,10 +15,61 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/resize.h"
-#include "av1/common/arm/convolve_neon.h"
 #include "config/av1_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
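+  // Accumulate the two centre taps (typically the largest coefficients) last
+  // with saturating adds so the 16-bit intermediate sum cannot wrap.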
+  int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
+
+  int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
+  return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+                                       const int16x8_t filter) {
+  int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+  int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+  int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+  int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+  int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+  int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+  int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+  int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+  return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter);
+}
+
 static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                               const int src_stride,
                                               uint8_t *dst,
@@ -192,15 +243,16 @@
   do {
     load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                 &s[6], &s[7]);
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                                   &s[6], &s[7]);
     x = width_hor;
 
     do {
       src += 8;
       load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
                   &s[12], &s[13]);
-      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
-                       &s[13]);
+      transpose_elems_inplace_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                                     &s[12], &s[13]);
 
       d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
       d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
@@ -210,7 +262,7 @@
       // 10 11 12 13 50 51 52 53
       // 20 21 22 23 60 61 62 63
       // 30 31 32 33 70 71 72 73
-      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+      transpose_elems_inplace_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
       vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
                     0);
       vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
@@ -308,7 +360,8 @@
   do {
     load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                 &s[6], &s[7]);
-    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+    transpose_elems_u8_4x8(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
+                           &s[0], &s[1], &s[2], &s[3]);
     x = width_hor;
 
     do {
@@ -316,8 +369,8 @@
       src += 8;
       load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
                   &s[10], &s[11]);
-      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
-                       &s[11]);
+      transpose_elems_inplace_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                                     &s[10], &s[11]);
 
       d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
       d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
@@ -453,14 +506,16 @@
     load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                 &s[6], &s[7]);
     src += 1;
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                                   &s[6], &s[7]);
     x = width_hor;
 
     do {
       load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                   &s[7], &s[8]);
       src += 8;
-      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+      transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                                     &s[7], &s[8]);
 
       // 00 10 20 30 40 50 60 70
       // 01 11 21 31 41 51 61 71
@@ -487,7 +542,8 @@
       // 50 51 52 53 54 55 xx xx
       // 60 61 62 63 64 65 xx xx
       // 70 71 72 73 74 75 xx xx
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+                                     &d[6], &d[7]);
       // store 2 extra pixels
       vst1_u8(t + 0 * stride_hor, d[0]);
       vst1_u8(t + 1 * stride_hor, d[1]);
@@ -586,15 +642,16 @@
   do {
     load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                 &s[6], &s[7]);
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                                   &s[6], &s[7]);
     x = width_hor;
 
     do {
       src += 8;
       load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
                   &s[13], &s[14]);
-      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
-                       &s[14]);
+      transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11],
+                                     &s[12], &s[13], &s[14]);
 
       // 00 10 20 30 40 50 60 70
       // 01 11 21 31 41 51 61 71
@@ -619,7 +676,8 @@
       // 50 51 52 53 54 55 xx xx
       // 60 61 62 63 64 65 xx xx
       // 70 71 72 73 74 75 xx xx
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+                                     &d[6], &d[7]);
       // store 2 extra pixels
       vst1_u8(t + 0 * stride_hor, d[0]);
       vst1_u8(t + 1 * stride_hor, d[1]);
@@ -828,3 +886,293 @@
     aom_extend_frame_borders(dst, num_planes);
   }
 }
+
+static INLINE void scaledconvolve_horiz_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
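+  // Horizontal pass for blocks of 4 columns: when the filter phase is
+  // non-zero, a transposed 8x4 block is convolved and the 4x4 result is
+  // staged in 'temp', then vld4 transposes it back before storing. A zero
+  // phase simply copies the centre sample (offset 3 after the src shift).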
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  y = h;
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8], d;
+          int16x8_t ss[4];
+          int16x4_t t[8], tt;
+
+          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+          transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+          t[0] = vget_low_s16(ss[0]);
+          t[1] = vget_low_s16(ss[1]);
+          t[2] = vget_low_s16(ss[2]);
+          t[3] = vget_low_s16(ss[3]);
+          t[4] = vget_high_s16(ss[0]);
+          t[5] = vget_high_s16(ss[1]);
+          t[6] = vget_high_s16(ss[2]);
+          t[7] = vget_high_s16(ss[3]);
+
+          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+                           filters);
+          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+          store_u8_4x1(&temp[4 * z], d, 0);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 block of filtered values back to dst
+      {
+        const uint8x8x4_t d4 = vld4_u8(temp);
+        store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0);
+        store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0);
+        store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0);
+        store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0);
+      }
+      x += 4;
+    } while (x < w);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    y -= 4;
+  } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = (h + 7) & ~7;
+
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      uint8x8_t d[8];
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8];
+          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+                      &s[5], &s[6], &s[7]);
+          transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4],
+                                         &s[5], &s[6], &s[7]);
+          d[0] = scale_filter_8(s, filters);
+          vst1_u8(&temp[8 * z], d[0]);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 block of filtered values back to dst
+      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+                  &d[7]);
+      transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+                                     &d[6], &d[7]);
+      store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5],
+                   d[6], d[7]);
+      x += 8;
+    } while (x < w);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
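+  // Vertical pass for 4-wide blocks: each output row either copies the
+  // centre-tap source row (zero phase) or convolves 8 source rows with the
+  // 8-tap kernel for that phase.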
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      int16x4_t t[8], tt;
+
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
+      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+      store_u8_4x1(dst, d, 0);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      d = scale_filter_8(s, filters);
+      vst1_u8(dst, d);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int x, y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      x = 0;
+      do {
+        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+        uint8x16_t ss[8];
+        uint8x8_t s[8], d[2];
+        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+                     &ss[5], &ss[6], &ss[7]);
+        s[0] = vget_low_u8(ss[0]);
+        s[1] = vget_low_u8(ss[1]);
+        s[2] = vget_low_u8(ss[2]);
+        s[3] = vget_low_u8(ss[3]);
+        s[4] = vget_low_u8(ss[4]);
+        s[5] = vget_low_u8(ss[5]);
+        s[6] = vget_low_u8(ss[6]);
+        s[7] = vget_low_u8(ss[7]);
+        d[0] = scale_filter_8(s, filters);
+
+        s[0] = vget_high_u8(ss[0]);
+        s[1] = vget_high_u8(ss[1]);
+        s[2] = vget_high_u8(ss[2]);
+        s[3] = vget_high_u8(ss[3]);
+        s[4] = vget_high_u8(ss[4]);
+        s[5] = vget_high_u8(ss[5]);
+        s[6] = vget_high_u8(ss[6]);
+        s[7] = vget_high_u8(ss[7]);
+        d[1] = scale_filter_8(s, filters);
+        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+        src_y += 16;
+        x += 16;
+      } while (x < w);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
+  // is still big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  }
+}
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index d14088e..1d3a3cc 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -418,16 +418,16 @@
       dst1_32_ptr += 2;
       dst2_ptr += 2;
       load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
-      transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+      transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4);
       load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
-      transpose_s32_4x4(&d1, &d2, &d3, &d4);
+      transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4);
       do {
         src1_ptr += 4;
         src2_ptr += 4;
         load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
-        transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+        transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8);
         load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
-        transpose_s32_4x4(&d5, &d6, &d7, &d8);
+        transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8);
         q23 = vaddl_s16(s2, s3);
         q45 = vaddl_s16(s4, s5);
         q67 = vaddl_s16(s6, s7);
@@ -438,7 +438,7 @@
         q34567 = vaddq_s32(q4567, vmovl_s16(s3));
         q45678 = vaddq_s32(q4567, vmovl_s16(s8));
 
-        transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+        transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678);
         store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
                       q45678);
         dst1_32_ptr += 4;
@@ -457,7 +457,7 @@
         r34567 = vaddq_s32(r4567, d3);
         r45678 = vaddq_s32(r4567, d8);
 
-        transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+        transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678);
         store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
         dst2_ptr += 4;
         d1 = d5;
@@ -844,9 +844,9 @@
       w = width;
 
       load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
-      transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+      transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4);
       load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
-      transpose_s32_4x4(&r1, &r2, &r3, &r4);
+      transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4);
       src1_ptr += 4;
       src2_ptr += 4;
 
@@ -861,9 +861,9 @@
 
       do {
         load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
-        transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+        transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8);
         load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
-        transpose_s32_4x4(&r5, &r6, &r7, &r8);
+        transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8);
         src1_ptr += 4;
         src2_ptr += 4;
 
@@ -873,7 +873,7 @@
         q567 = vadd_s16(d7, q56);
         q78 = vadd_s16(d7, d8);
         q678 = vadd_s16(d6, q78);
-        transpose_s16_4x4d(&q234, &q345, &q456, &q567);
+        transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567);
         store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
         dst1_ptr += 4;
 
@@ -887,7 +887,7 @@
         r567 = vaddq_s32(r7, r56);
         r78 = vaddq_s32(r7, r8);
         r678 = vaddq_s32(r6, r78);
-        transpose_s32_4x4(&r234, &r345, &r456, &r567);
+        transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567);
         store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
         dst2_ptr += 4;
 
@@ -1449,11 +1449,11 @@
   return 0;
 }
 
-void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
-                                           int height, int stride, int eps,
-                                           const int *xqd, uint8_t *dst8,
-                                           int dst_stride, int32_t *tmpbuf,
-                                           int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+                                          int height, int stride, int eps,
+                                          const int *xqd, uint8_t *dst8,
+                                          int dst_stride, int32_t *tmpbuf,
+                                          int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1591,4 +1591,5 @@
       h--;
     } while (h > 0);
   }
+  return 0;
 }
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index b4d3148..4723154 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c
@@ -9,463 +9,259 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <assert.h>
-#include <arm_neon.h>
-#include <memory.h>
-#include <math.h>
+#include "warp_plane_neon.h"
 
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-#include "config/av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-#include "av1/common/scale.h"
-
-/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
-   * Each coefficient is stored in 8 bits instead of 16 bits
-   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
-
-     This is done in order to avoid overflow: Since the tap with the largest
-     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
-     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
-     convolve functions.
-
-     Instead, we use the summation order
-     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
-     The rearrangement of coefficients in this table is so that we can get the
-     coefficients into the correct order more quickly.
-*/
-/* clang-format off */
-DECLARE_ALIGNED(8, static const int8_t,
-                filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
-#if WARPEDPIXEL_PREC_BITS == 6
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
-  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
-  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
-  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
-  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
-  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
-  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
-  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
-  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
-  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
-  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
-  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
-  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
-  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
-  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
-  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
-  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
-  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
-  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
-  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
-  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
-  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
-  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
-  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
-  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
-  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
-  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
-  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
-  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
-  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
-  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
-  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
-  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
-  // [1, 2)
-  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
-  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
-  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
-  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
-  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
-  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
-  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
-  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
-  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
-  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
-  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
-  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
-  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
-  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
-  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
-  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
-  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
-  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
-  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
-  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
-  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
-  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
-  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
-  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
-  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
-  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
-  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
-  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
-  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
-  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
-  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
-  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
-  // dummy (replicate row index 191)
-  { 0, 0,   2,  -1, 0,   0, 127, 0},
-
-#else
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
-  // [1, 2)
-  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
-  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
-  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
-  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
-  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
-  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
-  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
-  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
-  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
-  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
-  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
-  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
-  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
-  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
-  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
-  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
-  // dummy (replicate row index 95)
-  { 0, 0,   4,  -3, 0,  -1, 127, 1},
-#endif  // WARPEDPIXEL_PREC_BITS == 6
-};
-/* clang-format on */
-
-static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
-                            uint8x8_t src_1, int16x4_t *res) {
-  int16x8_t coeff_0, coeff_1;
-  int16x8_t pix_0, pix_1;
-
-  coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
-                         vreinterpret_s16_s32(x1.val[0]));
-  coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
-                         vreinterpret_s16_s32(x1.val[1]));
-
-  pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
-  pix_0 = vmulq_s16(coeff_0, pix_0);
-
-  pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
-  pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
-
-  *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
-}
-
-static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
-                                          uint8x16_t src_3, uint8x16_t src_4,
-                                          int16x8_t *tmp_dst, int sx, int alpha,
-                                          int k, const int offset_bits_horiz,
-                                          const int reduce_bits_horiz) {
-  const uint8x16_t mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
-  const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
-  const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
-
-  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
-  int32x2x2_t b0, b1;
-  uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
-  int32x4_t tmp_res_low, tmp_res_high;
-  uint16x8_t res;
-  int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
-
-  uint8x16_t tmp_0 = vandq_u8(src_1, mask);
-  uint8x16_t tmp_1 = vandq_u8(src_2, mask);
-  uint8x16_t tmp_2 = vandq_u8(src_3, mask);
-  uint8x16_t tmp_3 = vandq_u8(src_4, mask);
-
-  tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
-  tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
-
-  src_1 = vaddq_u8(tmp_0, tmp_2);
-  src_2 = vaddq_u8(tmp_1, tmp_3);
-
-  src_1_low = vget_low_u8(src_1);
-  src_2_low = vget_low_u8(src_2);
-  src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
-  src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
-  src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
-  src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
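+  // Horizontal offset for 8-bit input: 1 << offset_bits_horiz, with
+  // offset_bits_horiz = bd + FILTER_BITS - 1 and bd == 8.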
 
+  // Load the 4 interpolation filters (8 taps each), one for each output pixel.
-  f0 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f1 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f2 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f3 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f4 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f5 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f6 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f7 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  int16x8_t f[4];
+  load_filters_4(f, sx, alpha);
 
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
-                vreinterpret_s32_s16(vget_low_s16(f2)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
-                vreinterpret_s32_s16(vget_low_s16(f6)));
-  convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
 
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
-                vreinterpret_s32_s16(vget_low_s16(f3)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
-                vreinterpret_s32_s16(vget_low_s16(f7)));
-  convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+  int16x8_t m0 = vmulq_s16(f[0], in16_lo);
+  int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
+  int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
+  int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));
 
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
-                vreinterpret_s32_s16(vget_high_s16(f2)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
-                vreinterpret_s32_s16(vget_high_s16(f6)));
-  convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+                              vpaddlq_s16(m3) };
 
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
-                vreinterpret_s32_s16(vget_high_s16(f3)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
-                vreinterpret_s32_s16(vget_high_s16(f7)));
-  convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
 
-  tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
-  tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Loading the 8 filter taps
+  int16x8_t f[8];
+  load_filters_8(f, sx, alpha);
+
+  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+  int16x8_t m0 = vmulq_s16(f[0], in16_lo);
+  int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
+  int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
+  int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));
+  int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4));
+  int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5));
+  int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6));
+  int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7));
+
+  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+                              vpaddlq_s16(m3) };
+  int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
+                              vpaddlq_s16(m7) };
+
+  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+  int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);
 
   tmp_res_low = vaddq_s32(tmp_res_low, add_const);
   tmp_res_high = vaddq_s32(tmp_res_high, add_const);
 
-  res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
-  res = vqrshlq_u16(res, shift);
-
-  tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
 }
 
-static INLINE void vertical_filter_neon(const int16x8_t *src,
-                                        int32x4_t *res_low, int32x4_t *res_high,
-                                        int sy, int gamma) {
-  int16x4_t src_0, src_1, fltr_0, fltr_1;
-  int32x4_t res_0, res_1;
-  int32x2_t res_0_im, res_1_im;
-  int32x4_t res_even, res_odd, im_res_0, im_res_1;
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
-  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
-  int16x8x2_t b0, b1, b2, b3;
-  int32x4x2_t c0, c1, c2, c3;
-  int32x4x2_t d0, d1, d2, d3;
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
 
-  b0 = vtrnq_s16(src[0], src[1]);
-  b1 = vtrnq_s16(src[2], src[3]);
-  b2 = vtrnq_s16(src[4], src[5]);
-  b3 = vtrnq_s16(src[6], src[7]);
+  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
 
-  c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
-                 vreinterpretq_s32_s16(b0.val[1]));
-  c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
-                 vreinterpretq_s32_s16(b1.val[1]));
-  c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
-                 vreinterpretq_s32_s16(b2.val[1]));
-  c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
-                 vreinterpretq_s32_s16(b3.val[1]));
+  int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
+  int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
+  int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
+  int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));
 
-  f0 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f1 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f2 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f3 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f4 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f5 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f6 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f7 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+                              vpaddlq_s16(m3) };
 
-  d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
-  d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
-  d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
-  d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
 
-  // row:0,1 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
 
-  // row:0,1,2,3 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
 
-  // row:0,1 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
 
-  // row:0,1,2,3 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
 
-  // row:0,1,2,3 even_col:0,2,4,6
-  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
 
-  // row:4,5 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
+  int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
+  int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
+  int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
+  int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));
+  int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4));
+  int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5));
+  int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6));
+  int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7));
 
-  // row:4,5,6,7 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
+                              vpaddlq_s16(m3) };
+  int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
+                              vpaddlq_s16(m7) };
 
-  // row:4,5 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
+  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
+  int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);
 
-  // row:4,5,6,7 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
 
-  // row:4,5,6,7 even_col:0,2,4,6
-  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
+}
 
-  // row:0-7 even_col:0,2,4,6
-  res_even = vaddq_s32(im_res_0, im_res_1);
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+                                          int sy) {
+  int16x4_t s0 = vget_low_s16(src[0]);
+  int16x4_t s1 = vget_low_s16(src[1]);
+  int16x4_t s2 = vget_low_s16(src[2]);
+  int16x4_t s3 = vget_low_s16(src[3]);
+  int16x4_t s4 = vget_low_s16(src[4]);
+  int16x4_t s5 = vget_low_s16(src[5]);
+  int16x4_t s6 = vget_low_s16(src[6]);
+  int16x4_t s7 = vget_low_s16(src[7]);
 
-  // row:0,1 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
 
-  // row:0,1,2,3 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
 
-  // row:0,1 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
+  *res = m0123;
+}
 
-  // row:0,1,2,3 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+                                          int sy, int gamma) {
+  int16x8_t s0, s1, s2, s3;
+  transpose_elems_s16_4x8(
+      vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+      vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+      vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
 
-  // row:0,1,2,3 odd_col:1,3,5,7
-  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
 
-  // row:4,5 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
 
-  // row:4,5,6,7 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
 
-  // row:4,5 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
+  *res = horizontal_add_4d_s32x4(m0123_pairs);
+}
 
-  // row:4,5,6,7 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
 
-  // row:4,5,6,7 odd_col:1,3,5,7
-  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
 
-  // row:0-7 odd_col:1,3,5,7
-  res_odd = vaddq_s32(im_res_0, im_res_1);
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
 
-  // reordering as 0 1 2 3 | 4 5 6 7
-  c0 = vtrnq_s32(res_even, res_odd);
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
 
-  // Final store
-  *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
-  *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+  *res_low = m0123;
+  *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy,
+                                          int gamma) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+  int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+  m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+  int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+  m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+  int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+  m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+  int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+  m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+  int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };
+
+  *res_low = horizontal_add_4d_s32x4(m0123_pairs);
+  *res_high = horizontal_add_4d_s32x4(m4567_pairs);
 }
 
 void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
@@ -474,242 +270,7 @@
                           int subsampling_x, int subsampling_y,
                           ConvolveParams *conv_params, int16_t alpha,
                           int16_t beta, int16_t gamma, int16_t delta) {
-  int16x8_t tmp[15];
-  const int bd = 8;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
-  const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
-  const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
-
-  int limit = 0;
-  uint8x16_t vec_dup, mask_val;
-  int32x4_t res_lo, res_hi;
-  int16x8_t result_final;
-  uint8x16_t src_1, src_2, src_3, src_4;
-  static const uint8_t k0To15[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
-                                      8, 9, 10, 11, 12, 13, 14, 15 };
-  uint8x16_t indx_vec = vld1q_u8(k0To15);
-  uint8x16_t cmp_vec;
-
-  const int reduce_bits_horiz = conv_params->round_0;
-  const int reduce_bits_vert = conv_params->is_compound
-                                   ? conv_params->round_1
-                                   : 2 * FILTER_BITS - reduce_bits_horiz;
-  const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int16x4_t res_sub_const =
-      vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
-                   (1 << (offset_bits - conv_params->round_1 - 1))));
-  int k;
-
-  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
-  for (int i = 0; i < p_height; i += 8) {
-    for (int j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int64_t dst_x =
-          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
-      const int64_t dst_y =
-          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
-      const int64_t x4 = dst_x >> subsampling_x;
-      const int64_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      // horizontal
-      if (ix4 <= -7) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int16_t dup_val =
-              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
-
-          tmp[k + 7] = vdupq_n_s16(dup_val);
-        }
-      } else if (ix4 >= width + 6) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-                            ref[iy * stride + (width - 1)] *
-                                (1 << (FILTER_BITS - reduce_bits_horiz));
-          tmp[k + 7] = vdupq_n_s16(dup_val);
-        }
-      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
-        const int out_of_boundary_left = -(ix4 - 6);
-        const int out_of_boundary_right = (ix4 + 8) - width;
-
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          const uint8_t *src = ref + iy * stride + ix4 - 7;
-          src_1 = vld1q_u8(src);
-
-          if (out_of_boundary_left >= 0) {
-            limit = out_of_boundary_left + 1;
-            cmp_vec = vdupq_n_u8(out_of_boundary_left);
-            vec_dup = vdupq_n_u8(*(src + limit));
-            mask_val = vcleq_u8(indx_vec, cmp_vec);
-            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
-          }
-          if (out_of_boundary_right >= 0) {
-            limit = 15 - (out_of_boundary_right + 1);
-            cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
-            vec_dup = vdupq_n_u8(*(src + limit));
-            mask_val = vcgeq_u8(indx_vec, cmp_vec);
-            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
-          }
-          src_2 = vextq_u8(src_1, src_1, 1);
-          src_3 = vextq_u8(src_2, src_2, 1);
-          src_4 = vextq_u8(src_3, src_3, 1);
-
-          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
-                                 offset_bits_horiz, reduce_bits_horiz);
-        }
-      } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          const uint8_t *src = ref + iy * stride + ix4 - 7;
-          src_1 = vld1q_u8(src);
-          src_2 = vextq_u8(src_1, src_1, 1);
-          src_3 = vextq_u8(src_2, src_2, 1);
-          src_4 = vextq_u8(src_3, src_3, 1);
-
-          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
-                                 offset_bits_horiz, reduce_bits_horiz);
-        }
-      }
-
-      // vertical
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        const int16x8_t *v_src = tmp + (k + 4);
-
-        vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
-
-        res_lo = vaddq_s32(res_lo, add_const_vert);
-        res_hi = vaddq_s32(res_hi, add_const_vert);
-
-        if (conv_params->is_compound) {
-          uint16_t *const p =
-              (uint16_t *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-
-          res_lo = vrshlq_s32(res_lo, shift_vert);
-          if (conv_params->do_average) {
-            uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
-            uint16x4_t tmp16_lo = vld1_u16(p);
-            int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
-            int16x4_t tmp16_low;
-            if (conv_params->use_dist_wtd_comp_avg) {
-              res_lo = vmulq_s32(res_lo, bwd);
-              tmp32_lo = vmulq_s32(tmp32_lo, fwd);
-              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
-              tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
-            } else {
-              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
-              tmp16_low = vshrn_n_s32(tmp32_lo, 1);
-            }
-            int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
-            res_low = vqrshl_s16(res_low, round_bits_vec);
-            int16x8_t final_res_low = vcombine_s16(res_low, res_low);
-            uint8x8_t res_8_low = vqmovun_s16(final_res_low);
-
-            vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
-          } else {
-            uint16x4_t res_u16_low = vqmovun_s32(res_lo);
-            vst1_u16(p, res_u16_low);
-          }
-          if (p_width > 4) {
-            uint16_t *const p4 =
-                (uint16_t *)&conv_params
-                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
-            res_hi = vrshlq_s32(res_hi, shift_vert);
-            if (conv_params->do_average) {
-              uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
-
-              uint16x4_t tmp16_hi = vld1_u16(p4);
-              int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
-              int16x4_t tmp16_high;
-              if (conv_params->use_dist_wtd_comp_avg) {
-                res_hi = vmulq_s32(res_hi, bwd);
-                tmp32_hi = vmulq_s32(tmp32_hi, fwd);
-                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
-                tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
-              } else {
-                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
-                tmp16_high = vshrn_n_s32(tmp32_hi, 1);
-              }
-              int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
-              res_high = vqrshl_s16(res_high, round_bits_vec);
-              int16x8_t final_res_high = vcombine_s16(res_high, res_high);
-              uint8x8_t res_8_high = vqmovun_s16(final_res_high);
-
-              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
-                            0);
-            } else {
-              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
-              vst1_u16(p4, res_u16_high);
-            }
-          }
-        } else {
-          res_lo = vrshlq_s32(res_lo, shift_vert);
-          res_hi = vrshlq_s32(res_hi, shift_vert);
-
-          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
-          result_final = vsubq_s16(result_final, sub_constant);
-
-          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
-          uint8x8_t val = vqmovun_s16(result_final);
-
-          if (p_width == 4) {
-            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
-          } else {
-            vst1_u8(p, val);
-          }
-        }
-      }
-    }
-  }
+  av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+                         p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, conv_params, alpha, beta, gamma, delta);
 }
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
new file mode 100644
index 0000000..de5e3bd
--- /dev/null
+++ b/av1/common/arm/warp_plane_neon.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+#define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+                                                 int alpha);
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+                                                 int alpha);
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx);
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx);
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+                                          int sy);
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+                                          int sy, int gamma);
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy);
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy,
+                                          int gamma);
+
+static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) {
+  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+}
+
+static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) {
+  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+  out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >>
+                                                      WARPEDDIFF_PREC_BITS)));
+}
+
+static INLINE int clamp_iy(int iy, int height) {
+  return clamp(iy, 0, height - 1);
+}
+
+static INLINE void warp_affine_horizontal(
+    const uint8_t *ref, int width, int height, int stride, int p_width,
+    int p_height, int16_t alpha, int16_t beta, const int64_t x4,
+    const int64_t y4, const int i, int16x8_t tmp[], const uint8x16_t indx_vec) {
+  const int bd = 8;
+  const int reduce_bits_horiz = ROUND0_BITS;
+  const int height_limit = AOMMIN(8, p_height - i) + 7;
+
+  int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+  int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+
+  int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+  sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+  sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+  if (ix4 <= -7) {
+    for (int k = 0; k < height_limit; ++k) {
+      int iy = clamp_iy(iy4 + k - 7, height);
+      int16_t dup_val =
+          (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+          ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  } else if (ix4 >= width + 6) {
+    for (int k = 0; k < height_limit; ++k) {
+      int iy = clamp_iy(iy4 + k - 7, height);
+      int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                        ref[iy * stride + (width - 1)] *
+                            (1 << (FILTER_BITS - reduce_bits_horiz));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  }
+
+  uint8x16_t in[15];
+  if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+    const int out_of_boundary_left = -(ix4 - 6);
+    const int out_of_boundary_right = (ix4 + 8) - width;
+
+    for (int k = 0; k < height_limit; ++k) {
+      const int iy = clamp_iy(iy4 + k - 7, height);
+      const uint8_t *src = ref + iy * stride + ix4 - 7;
+      uint8x16_t src_1 = vld1q_u8(src);
+
+      if (out_of_boundary_left >= 0) {
+        int limit = out_of_boundary_left + 1;
+        uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left);
+        uint8x16_t vec_dup = vdupq_n_u8(*(src + limit));
+        uint8x16_t mask_val = vcleq_u8(indx_vec, cmp_vec);
+        src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+      }
+      if (out_of_boundary_right >= 0) {
+        int limit = 15 - (out_of_boundary_right + 1);
+        uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+        uint8x16_t vec_dup = vdupq_n_u8(*(src + limit));
+        uint8x16_t mask_val = vcgeq_u8(indx_vec, cmp_vec);
+        src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+      }
+      in[k] = src_1;
+    }
+  } else {
+    for (int k = 0; k < height_limit; ++k) {
+      const int iy = clamp_iy(iy4 + k - 7, height);
+      const uint8_t *src = ref + iy * stride + ix4 - 7;
+      in[k] = vld1q_u8(src);
+    }
+  }
+
+  if (p_width == 4) {
+    if (beta == 0) {
+      if (alpha == 0) {
+        for (int k = 0; k < height_limit; ++k) {
+          tmp[k] = horizontal_filter_4x1_f1(in[k], sx4);
+        }
+      } else {
+        for (int k = 0; k < height_limit; ++k) {
+          tmp[k] = horizontal_filter_4x1_f4(in[k], sx4, alpha);
+        }
+      }
+    } else {
+      if (alpha == 0) {
+        for (int k = 0; k < height_limit; ++k) {
+          const int sx = sx4 + beta * (k - 3);
+          tmp[k] = horizontal_filter_4x1_f1(in[k], sx);
+        }
+      } else {
+        for (int k = 0; k < height_limit; ++k) {
+          const int sx = sx4 + beta * (k - 3);
+          tmp[k] = horizontal_filter_4x1_f4(in[k], sx, alpha);
+        }
+      }
+    }
+  } else {
+    if (beta == 0) {
+      if (alpha == 0) {
+        for (int k = 0; k < height_limit; ++k) {
+          tmp[k] = horizontal_filter_8x1_f1(in[k], sx4);
+        }
+      } else {
+        for (int k = 0; k < height_limit; ++k) {
+          tmp[k] = horizontal_filter_8x1_f8(in[k], sx4, alpha);
+        }
+      }
+    } else {
+      if (alpha == 0) {
+        for (int k = 0; k < height_limit; ++k) {
+          const int sx = sx4 + beta * (k - 3);
+          tmp[k] = horizontal_filter_8x1_f1(in[k], sx);
+        }
+      } else {
+        for (int k = 0; k < height_limit; ++k) {
+          const int sx = sx4 + beta * (k - 3);
+          tmp[k] = horizontal_filter_8x1_f8(in[k], sx, alpha);
+        }
+      }
+    }
+  }
+}
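
Every specialization dispatched above computes the same 8-tap horizontal convolution; the _f1/_f4/_f8 suffixes only decide whether a single filter phase (alpha == 0) or one phase per output pixel is loaded. As a rough scalar sketch of what one element of tmp[k] ends up holding, written against the vector code above rather than the canonical C reference path (warp_hfilter_scalar is a hypothetical name used only for illustration):

static int16_t warp_hfilter_scalar(const uint8_t *in /* 15 samples */, int sx,
                                   int alpha, int x /* output column, 0..7 */) {
  const int16_t *f =
      av1_warped_filter[(sx + x * alpha) >> WARPEDDIFF_PREC_BITS];
  int32_t sum = 1 << (8 + FILTER_BITS - 1);  // horizontal offset for bd == 8
  for (int t = 0; t < 8; ++t) sum += f[t] * in[x + t];
  // Rounded shift by ROUND0_BITS with unsigned 16-bit saturation, as
  // vqrshrun_n_s32 does in the vector paths.
  sum = (sum + (1 << (ROUND0_BITS - 1))) >> ROUND0_BITS;
  if (sum < 0) sum = 0;
  if (sum > 65535) sum = 65535;
  return (int16_t)sum;
}

The saturated unsigned result is kept as the bit pattern of an int16x8_t, which is why the filter helpers reinterpret the uint16x8_t before returning.
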
+
+static INLINE void warp_affine_vertical(
+    uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound,
+    uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg,
+    int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j,
+    int16x8_t tmp[], const int fwd, const int bwd) {
+  const int bd = 8;
+  const int reduce_bits_horiz = ROUND0_BITS;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  int add_const_vert;
+  if (is_compound) {
+    add_const_vert =
+        (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1));
+  } else {
+    add_const_vert =
+        (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1));
+  }
+  const int sub_constant = (1 << (bd - 1)) + (1 << bd);
+
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int res_sub_const =
+      (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) -
+      (1 << (offset_bits - COMPOUND_ROUND1_BITS)) -
+      (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+
+  int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+  sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+  sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+  if (p_width > 4) {
+    for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+      int sy = sy4 + delta * (k + 4);
+      const int16x8_t *v_src = tmp + (k + 4);
+
+      int32x4_t res_lo, res_hi;
+      if (gamma == 0) {
+        vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy);
+      } else {
+        vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma);
+      }
+
+      res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+      res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert));
+
+      if (is_compound) {
+        uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+        int16x8_t res_s16 =
+            vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS),
+                         vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS));
+        if (do_average) {
+          int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p));
+          if (use_dist_wtd_comp_avg) {
+            int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd);
+            int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd);
+            tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd);
+            tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd);
+            tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS));
+          } else {
+            tmp16 = vhaddq_s16(tmp16, res_s16);
+          }
+          int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const));
+          uint8x8_t res8 = vqshrun_n_s16(
+              res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+          vst1_u8(&pred[(i + k + 4) * p_stride + j], res8);
+        } else {
+          vst1q_u16(p, vreinterpretq_u16_s16(res_s16));
+        }
+      } else {
+        int16x8_t res16 =
+            vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS),
+                         vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS));
+        res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant));
+
+        uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+        vst1_u8(p, vqmovun_s16(res16));
+      }
+    }
+  } else {
+    // p_width == 4
+    for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+      int sy = sy4 + delta * (k + 4);
+      const int16x8_t *v_src = tmp + (k + 4);
+
+      int32x4_t res_lo;
+      if (gamma == 0) {
+        vertical_filter_4x1_f1(v_src, &res_lo, sy);
+      } else {
+        vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma);
+      }
+
+      res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+
+      if (is_compound) {
+        uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+
+        int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS);
+        if (do_average) {
+          uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+          int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p));
+          if (use_dist_wtd_comp_avg) {
+            int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd);
+            tmp32_lo = vmlal_n_s16(tmp32_lo, res_lo_s16, bwd);
+            tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+          } else {
+            tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16);
+          }
+          int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const));
+          uint8x8_t res8 = vqshrun_n_s16(
+              vcombine_s16(res, vdup_n_s16(0)),
+              2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+          vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0);
+        } else {
+          uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16);
+          vst1_u16(p, res_u16_low);
+        }
+      } else {
+        int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS);
+        res16 = vsub_s16(res16, vdup_n_s16(sub_constant));
+
+        uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+        uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0)));
+        vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+      }
+    }
+  }
+}
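
For reference, the distance-weighted blend done above with vmull_n_s16 / vmlal_n_s16 / vshrn_n_s32 amounts to the following scalar expression (a sketch for illustration only; dist_wtd_blend is a hypothetical name):

static int16_t dist_wtd_blend(int16_t stored, int16_t cur, int fwd, int bwd) {
  // fwd weights the value already in conv_params->dst, bwd the new result;
  // the product is narrowed with a truncating shift by DIST_PRECISION_BITS.
  return (int16_t)((stored * fwd + cur * bwd) >> DIST_PRECISION_BITS);
}

The blended value then has res_sub_const added and is shifted down by 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS with unsigned saturation before being stored to pred.
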
+
+static INLINE void av1_warp_affine_common(
+    const int32_t *mat, const uint8_t *ref, int width, int height, int stride,
+    uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
+    int p_stride, int subsampling_x, int subsampling_y,
+    ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
+    int16_t delta) {
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const int is_compound = conv_params->is_compound;
+  uint16_t *const dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+  static const uint8_t k0To15[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                                      8, 9, 10, 11, 12, 13, 14, 15 };
+  const uint8x16_t indx_vec = vld1q_u8(k0To15);
+
+  assert(IMPLIES(is_compound, dst != NULL));
+  assert(IMPLIES(do_average, is_compound));
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int16x8_t tmp[15];
+      warp_affine_horizontal(ref, width, height, stride, p_width, p_height,
+                             alpha, beta, x4, y4, i, tmp, indx_vec);
+      warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
+                           dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
+                           delta, y4, i, j, tmp, w0, w1);
+    }
+  }
+}
+
+#endif  // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
new file mode 100644
index 0000000..39e3ad9
--- /dev/null
+++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Loading the 8 filter taps
+  int16x8_t f[4];
+  load_filters_4(f, sx, alpha);
+
+  int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+  int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+
+  uint8x8_t in0 = vget_low_u8(in);
+  uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+  uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+  uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+
+  int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+  int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+
+  int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Loading the 8 filter taps
+  int16x8_t f[8];
+  load_filters_8(f, sx, alpha);
+
+  int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+  int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+  int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+  int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+
+  uint8x8_t in0 = vget_low_u8(in);
+  uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+  uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+  uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+  uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+  uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+  uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+  uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+
+  int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+  int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+  int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+  int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+
+  int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+  int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
+}
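
Each 32-bit lane of the vusdotq_s32 results above is a four-element unsigned-by-signed dot product, i.e. (scalar sketch for illustration only; usdot_lane is a hypothetical name):

static int32_t usdot_lane(const uint8_t *u, const int8_t *s, int lane) {
  int32_t acc = 0;
  for (int t = 0; t < 4; ++t)
    acc += (int32_t)u[4 * lane + t] * s[4 * lane + t];
  return acc;
}

so the vpaddq_s32 of the m01/m23 (and m45/m67) pairs leaves one complete 8-tap sum per lane, matching what the standard Neon path accumulates with vmulq_s16, vpaddlq_s16 and horizontal_add_4d_s32x4.
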
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+  uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+  uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+  uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+
+  int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+  m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+  int32x4_t tmp_res_low = m0123;
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+  uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+  uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+  uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+  uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+  uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+
+  int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+  m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+  int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+  m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+  int32x4_t tmp_res_low = m0123;
+  int32x4_t tmp_res_high = m4567;
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+                                          int sy) {
+  int16x4_t s0 = vget_low_s16(src[0]);
+  int16x4_t s1 = vget_low_s16(src[1]);
+  int16x4_t s2 = vget_low_s16(src[2]);
+  int16x4_t s3 = vget_low_s16(src[3]);
+  int16x4_t s4 = vget_low_s16(src[4]);
+  int16x4_t s5 = vget_low_s16(src[5]);
+  int16x4_t s6 = vget_low_s16(src[6]);
+  int16x4_t s7 = vget_low_s16(src[7]);
+
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+  *res = m0123;
+}
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+                                          int sy, int gamma) {
+  int16x8_t s0, s1, s2, s3;
+  transpose_elems_s16_4x8(
+      vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+      vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+      vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+
+  *res = horizontal_add_4d_s32x4(m0123_pairs);
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
+
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+  *res_low = m0123;
+  *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy,
+                                          int gamma) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+  int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+  m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+  int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+  m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+  int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+  m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+  int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+  m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+  int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };
+
+  *res_low = horizontal_add_4d_s32x4(m0123_pairs);
+  *res_high = horizontal_add_4d_s32x4(m4567_pairs);
+}
+
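+// Entry point: forwards to av1_warp_affine_common(), the shared warp driver,
+// which is specialized through the static filter helpers defined in this
+// file.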
+void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref,
+                               int width, int height, int stride, uint8_t *pred,
+                               int p_col, int p_row, int p_width, int p_height,
+                               int p_stride, int subsampling_x,
+                               int subsampling_y, ConvolveParams *conv_params,
+                               int16_t alpha, int16_t beta, int16_t gamma,
+                               int16_t delta) {
+  av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+                         p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
new file mode 100644
index 0000000..2a48c5e
--- /dev/null
+++ b/av1/common/arm/warp_plane_sve.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
+#include <arm_neon_sve_bridge.h>
+
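+// Permutation indices for vqtbl1q_u8: each 16-byte row expands the source
+// vector into four overlapping 4-sample windows (starting at offsets 0-3,
+// 4-7 and 8-11 respectively), as expected by the USDOT instructions below.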
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+  // The 16-bit dot product instructions only exist in SVE and not Neon.
+  // We can get away without rewriting the existing Neon code by making use of
+  // the Neon-SVE bridge intrinsics to reinterpret a Neon vector as an SVE
+  // vector with the high part of the vector being "don't care", and then
+  // operating on that instead.
+  // This is clearly suboptimal on machines with an SVE vector length above
+  // 128 bits, as the remainder of the vector is wasted; however, this still
+  // appears to be beneficial compared to not using the instruction.
+  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+                                   svset_neonq_s16(svundef_s16(), x),
+                                   svset_neonq_s16(svundef_s16(), y)));
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the four 8-tap filters (one filter per output pixel).
+  int16x8_t f[4];
+  load_filters_4(f, sx, alpha);
+
+  int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+  int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+
+  uint8x8_t in0 = vget_low_u8(in);
+  uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+  uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+  uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+
+  int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+  int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+
+  int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  // Load the eight 8-tap filters (one filter per output pixel).
+  int16x8_t f[8];
+  load_filters_8(f, sx, alpha);
+
+  int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+  int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+  int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+  int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+
+  uint8x8_t in0 = vget_low_u8(in);
+  uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+  uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+  uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+  uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+  uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+  uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+  uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+
+  int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+  int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+  int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+  int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+
+  int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+  int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+  uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+  uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+  uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+
+  int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+  m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+  int32x4_t tmp_res_low = m0123;
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+  uint16x8_t res =
+      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+
+  int16x8_t f_s16 =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+  uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+  uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+  uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+  uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+  uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+
+  int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+  m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+  int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+  m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+  int32x4_t tmp_res_low = m0123;
+  int32x4_t tmp_res_high = m4567;
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+  return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+                                          int sy) {
+  int16x4_t s0 = vget_low_s16(src[0]);
+  int16x4_t s1 = vget_low_s16(src[1]);
+  int16x4_t s2 = vget_low_s16(src[2]);
+  int16x4_t s3 = vget_low_s16(src[3]);
+  int16x4_t s4 = vget_low_s16(src[4]);
+  int16x4_t s5 = vget_low_s16(src[5]);
+  int16x4_t s6 = vget_low_s16(src[6]);
+  int16x4_t s7 = vget_low_s16(src[7]);
+
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+  *res = m0123;
+}
+
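+// Transpose the 4x8 input tile so that the eight vertical taps of each output
+// column are contiguous in one vector, then reduce each column with the SVE
+// 16-bit dot product (aom_sdotq_s16).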
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+                                          int sy, int gamma) {
+  int16x8_t s0, s1, s2, s3;
+  transpose_elems_s16_4x8(
+      vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+      vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+      vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
+
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+
+  *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
+
+  int16x8_t f =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+  *res_low = m0123;
+  *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy,
+                                          int gamma) {
+  int16x8_t s0 = src[0];
+  int16x8_t s1 = src[1];
+  int16x8_t s2 = src[2];
+  int16x8_t s3 = src[3];
+  int16x8_t s4 = src[4];
+  int16x8_t s5 = src[5];
+  int16x8_t s6 = src[6];
+  int16x8_t s7 = src[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]);
+
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+
+  *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+}
+
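+// Entry point: as in the Neon paths, this delegates to the shared
+// av1_warp_affine_common() driver from warp_plane_neon.h, specialized with
+// the filter helpers defined above.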
+void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width,
+                         int height, int stride, uint8_t *pred, int p_col,
+                         int p_row, int p_width, int p_height, int p_stride,
+                         int subsampling_x, int subsampling_y,
+                         ConvolveParams *conv_params, int16_t alpha,
+                         int16_t beta, int16_t gamma, int16_t delta) {
+  av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+                         p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index d7f511d..6440c16 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -15,318 +15,334 @@
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
-#include "aom_dsp/txfm_common.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
 #include "aom_ports/mem.h"
 #include "av1/common/common.h"
-#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/restoration.h"
 
-#define HORZ_FILTERING_CORE(t0, t1, t2, t3, t4, t5, t6, res)                 \
-  res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t1));                            \
-  res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t3));                            \
-  res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t5));                            \
-  res3 = vreinterpretq_s16_u16(vmovl_u8(t6));                                \
-  res = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
-                                   conv_params->round_0);
+static INLINE uint16x8_t wiener_convolve5_8_2d_h(
+    const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+    const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter,
+    const int32x4_t round_vec, const uint16x8_t im_max_val) {
+  // Since the Wiener filter is symmetric about the middle tap (tap 2), add
+  // mirrored source elements before multiplying by filter coefficients.
+  int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4));
+  int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3));
+  int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
 
-#define PROCESS_ROW_FOR_VERTICAL_FILTER                                      \
-  __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);                          \
-                                                                             \
-  do {                                                                       \
-    s7 = vld1q_s16(s);                                                       \
-    s += src_stride;                                                         \
-                                                                             \
-    t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
-                                   bd, conv_params->round_1);                \
-    vst1_u8(d, t0);                                                          \
-    d += dst_stride;                                                         \
-                                                                             \
-    s0 = s1;                                                                 \
-    s1 = s2;                                                                 \
-    s2 = s3;                                                                 \
-    s3 = s4;                                                                 \
-    s4 = s5;                                                                 \
-    s5 = s6;                                                                 \
-    s6 = s7;                                                                 \
-    height--;                                                                \
-  } while (height > 0);
+  // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.)
+  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3);
 
-static INLINE void process_row_for_horz_filtering(
-    uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
-    int height, int bd) {
+  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+                                vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+  return vminq_u16(res, im_max_val);
+}
+
+static INLINE void convolve_add_src_horiz_5tap_neon(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+    const int32x4_t round_vec, const uint16x8_t im_max_val) {
   do {
-    __builtin_prefetch(src_ptr);
-
-    uint8x8_t tt0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-
-    __builtin_prefetch(dst_ptr);
-
-    const uint8_t *ss = src_ptr + 8;
-    uint16_t *d_tmp = dst_ptr;
+    const uint8_t *s = src_ptr;
+    uint16_t *d = dst_ptr;
     int width = w;
 
     do {
-      uint8x8_t tt7 = vld1_u8(ss);  // a8 a9 a10 a11 a12 a13 a14 a15
-      uint8x8_t ttemp_0 = tt0;
-      tt0 = tt7;
+      uint8x8_t s0, s1, s2, s3, s4;
+      load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4);
 
-      uint8x8_t tt1 = vext_u8(ttemp_0, tt7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-      uint8x8_t tt2 = vext_u8(ttemp_0, tt7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-      uint8x8_t tt3 = vext_u8(ttemp_0, tt7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-      uint8x8_t tt4 = vext_u8(ttemp_0, tt7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-      uint8x8_t tt5 = vext_u8(ttemp_0, tt7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-      uint8x8_t tt6 = vext_u8(ttemp_0, tt7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-      tt7 = vext_u8(ttemp_0, tt7, 7);            // a7 a8 a9 a10 a11 a12 a13 a14
+      uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter,
+                                              round_vec, im_max_val);
 
-      int16x8_t ttt0 = vreinterpretq_s16_u16(vaddl_u8(ttemp_0, tt6));
-      int16x8_t ttt1 = vreinterpretq_s16_u16(vaddl_u8(tt1, tt5));
-      int16x8_t ttt2 = vreinterpretq_s16_u16(vaddl_u8(tt2, tt4));
-      int16x8_t ttt3 = vreinterpretq_s16_u16(vmovl_u8(tt3));
-      uint16x8_t dd0 = wiener_convolve8_horiz_8x8(ttt0, ttt1, ttt2, ttt3,
-                                                  filter_x, bd, round0_bits);
+      vst1q_u16(d, d0);
 
-      vst1q_u16(d_tmp, dd0);
-
-      ss += 8;
-      d_tmp += 8;
+      s += 8;
+      d += 8;
       width -= 8;
-    } while (width > 0);
-
+    } while (width != 0);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    height--;
-  } while (height > 0);
+  } while (--h != 0);
 }
 
-/* Wiener filter 2D
-   Apply horizontal filter and store in a temporary buffer. When applying
-   vertical filter, overwrite the original pixel values.
-*/
+static INLINE uint16x8_t wiener_convolve7_8_2d_h(
+    const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+    const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5,
+    const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec,
+    const uint16x8_t im_max_val) {
+  // Since the Wiener filter is symmetric about the middle tap (tap 3), add
+  // mirrored source elements before multiplying by filter coefficients.
+  int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+  int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+  int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+  int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);
+
+  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+                                vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+  return vminq_u16(res, im_max_val);
+}
+
+static INLINE void convolve_add_src_horiz_7tap_neon(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+    const int32x4_t round_vec, const uint16x8_t im_max_val) {
+  do {
+    const uint8_t *s = src_ptr;
+    uint16_t *d = dst_ptr;
+    int width = w;
+
+    do {
+      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+      uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6,
+                                              x_filter, round_vec, im_max_val);
+
+      vst1q_u16(d, d0);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  } while (--h != 0);
+}
+
+static INLINE uint8x8_t wiener_convolve5_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter,
+    const int32x4_t round_vec) {
+  // Since the Wiener filter is symmetric about the middle tap (tap 2), add
+  // mirrored source elements before multiplying by filter coefficients.
+  int16x8_t s04 = vaddq_s16(s0, s4);
+  int16x8_t s13 = vaddq_s16(s1, s3);
+
+  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3);
+
+  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3);
+
+  int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+  int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+
+  return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+
+static INLINE void convolve_add_src_vert_5tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+    const int32x4_t round_vec) {
+  do {
+    const int16_t *s = (int16_t *)src;
+    uint8_t *d = dst;
+    int height = h;
+
+    while (height > 3) {
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+      uint8x8_t d0 =
+          wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+      uint8x8_t d1 =
+          wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec);
+      uint8x8_t d2 =
+          wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec);
+      uint8x8_t d3 =
+          wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    }
+
+    while (height-- != 0) {
+      int16x8_t s0, s1, s2, s3, s4;
+      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+
+      uint8x8_t d0 =
+          wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+
+      vst1_u8(d, d0);
+
+      d += dst_stride;
+      s += src_stride;
+    }
+
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+static INLINE uint8x8_t wiener_convolve7_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) {
+  // Since the Wiener filter is symmetric about the middle tap (tap 3), add
+  // mirrored source elements before multiplying by filter coefficients.
+  int16x8_t s06 = vaddq_s16(s0, s6);
+  int16x8_t s15 = vaddq_s16(s1, s5);
+  int16x8_t s24 = vaddq_s16(s2, s4);
+
+  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3);
+
+  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3);
+
+  int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+  int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+
+  return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+
+static INLINE void convolve_add_src_vert_7tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+    const int32x4_t round_vec) {
+  do {
+    const int16_t *s = (int16_t *)src;
+    uint8_t *d = dst;
+    int height = h;
+
+    while (height > 3) {
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
+      load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                    &s9);
+
+      uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+                                             y_filter, round_vec);
+      uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7,
+                                             y_filter, round_vec);
+      uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8,
+                                             y_filter, round_vec);
+      uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9,
+                                             y_filter, round_vec);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    }
+
+    while (height-- != 0) {
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+      uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+                                             y_filter, round_vec);
+
+      vst1_u8(d, d0);
+
+      d += dst_stride;
+      s += src_stride;
+    }
+
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
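+// The Wiener filter has at most seven non-zero taps (filter[7] is always
+// zero); when the outermost taps are also zero, the reduced 5-tap
+// (WIENER_WIN_REDUCED) path can be used.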
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+  assert(filter[7] == 0);
+  if (filter[0] == 0 && filter[6] == 0) {
+    return WIENER_WIN_REDUCED;
+  }
+  return WIENER_WIN;
+}
+
+// Wiener filter 2D
+// Apply horizontal filter and store in a temporary buffer. When applying
+// vertical filter, overwrite the original pixel values.
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
+                                      const int16_t *x_filter, int x_step_q4,
+                                      const int16_t *y_filter, int y_step_q4,
                                       int w, int h,
-                                      const ConvolveParams *conv_params) {
-  uint8_t *d;
-  const uint8_t *src_ptr, *s_tmp;
-  uint16_t *dst_ptr;
+                                      const WienerConvolveParams *conv_params) {
   (void)x_step_q4;
   (void)y_step_q4;
+  (void)conv_params;
 
-  int height;
-  const int bd = 8;
-  // Indicates the height needs to be processed during horizontal filtering.
-  const int intermediate_height = h + SUBPEL_TAPS - 1;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  int16_t filter_x_tmp[7], filter_y_tmp[7];
+  assert(w % 8 == 0);
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(x_filter[7] == 0 && y_filter[7] == 0);
+  // For bd == 8, assert that the horizontal filtering output fits in 15 bits:
+  assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15);
 
   DECLARE_ALIGNED(16, uint16_t,
-                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+                  im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
 
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w % 8));
+  const int x_filter_taps = get_wiener_filter_taps(x_filter);
+  const int y_filter_taps = get_wiener_filter_taps(y_filter);
+  int16x4_t x_filter_s16 = vld1_s16(x_filter);
+  int16x4_t y_filter_s16 = vld1_s16(y_filter);
+  // Add 128 to tap 3. (Needed for rounding.)
+  x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+  y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
 
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
+  const int im_stride = MAX_SB_SIZE;
+  const int im_h = h + y_filter_taps - 1;
+  const int horiz_offset = x_filter_taps / 2;
+  const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
 
-  assert(filter_x[7] == 0);
-  assert(filter_y[7] == 0);
+  const int bd = 8;
+  const uint16x8_t im_max_val =
+      vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1);
+  const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
 
-  /* assumption of horizontal filtering output will not exceed 15 bit.
-     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
-     16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
-   */
-  assert((conv_params->round_0) >= 1);
+  const int32x4_t vert_round_vec =
+      vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) -
+                  (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1)));
 
-  memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
-  memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
-
-  filter_x_tmp[3] += (1 << FILTER_BITS);
-  filter_y_tmp[3] += (1 << FILTER_BITS);
-
-  s_tmp = src - center_tap * src_stride - center_tap;
-  dst_ptr = temp;
-  src_ptr = s_tmp;
-  height = intermediate_height;
-
-  // For aarch_64.
-#if AOM_ARCH_AARCH64
-  int processed_height = 0;
-  uint16_t *d_tmp;
-  int width, remaining_height;
-  // Start of horizontal filtering.
-  if (intermediate_height > 7) {
-    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-    do {
-      const uint8_t *s;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-      width = w;
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      do {
-        int16x8_t res0, res1, res2, res3;
-        uint8x8_t t8, t9, t10, t11, t12, t13, t14;
-        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
-        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
-
-        HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
-        HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
-        HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
-        HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
-        HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
-        HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
-        HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
-        HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)
-
-        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
-                          &res11);
-        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
-                      res10, res11);
-
-        t0 = t8;
-        t1 = t9;
-        t2 = t10;
-        t3 = t11;
-        t4 = t12;
-        t5 = t13;
-        t6 = t14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * MAX_SB_SIZE;
-      height -= 8;
-      processed_height += 8;
-    } while (height > 7);
+  if (x_filter_taps == WIENER_WIN_REDUCED) {
+    convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset,
+                                     src_stride, im_block, im_stride, w, im_h,
+                                     x_filter_s16, horiz_round_vec, im_max_val);
+  } else {
+    convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset,
+                                     src_stride, im_block, im_stride, w, im_h,
+                                     x_filter_s16, horiz_round_vec, im_max_val);
   }
 
-  // Process the remaining rows for horizontal filtering.
-  remaining_height = intermediate_height - processed_height;
-  if (remaining_height)
-    process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
-                                   MAX_SB_SIZE, conv_params->round_0, w, height,
-                                   bd);
-
-  // Start of vertical filtering.
-  {
-    int16_t *src_tmp_ptr, *s;
-    uint8_t *dst_tmp_ptr;
-    height = h;
-    width = w;
-    src_tmp_ptr = (int16_t *)temp;
-    dst_tmp_ptr = dst;
-    src_stride = MAX_SB_SIZE;
-
-    do {
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-      uint8x8_t t0;
-      s = src_tmp_ptr;
-      d = dst_tmp_ptr;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      height = h;
-
-      do {
-        int16x8_t s8, s9, s10;
-        uint8x8_t t1, t2, t3;
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
-
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
-                                       bd, conv_params->round_1);
-
-        store_u8_8x4(d, dst_stride, t0, t1, t2, t3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height > 3);
-
-      if (height) {
-        PROCESS_ROW_FOR_VERTICAL_FILTER
-      }
-      src_tmp_ptr += 8;
-      dst_tmp_ptr += 8;
-      w -= 8;
-    } while (w > 0);
+  if (y_filter_taps == WIENER_WIN_REDUCED) {
+    convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter_s16, vert_round_vec);
+  } else {
+    convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+                                    y_filter_s16, vert_round_vec);
   }
-#else
-  // Start of horizontal filtering.
-  process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
-                                 MAX_SB_SIZE, conv_params->round_0, w, height,
-                                 bd);
-
-  // Start of vertical filtering.
-  {
-    int16_t *src_tmp_ptr, *s;
-    uint8_t *dst_tmp_ptr;
-    src_tmp_ptr = (int16_t *)temp;
-    dst_tmp_ptr = dst;
-    src_stride = MAX_SB_SIZE;
-
-    do {
-      uint8x8_t t0;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-      s = src_tmp_ptr;
-      d = dst_tmp_ptr;
-
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      height = h;
-      PROCESS_ROW_FOR_VERTICAL_FILTER
-
-      src_tmp_ptr += 8;
-      dst_tmp_ptr += 8;
-
-      w -= 8;
-    } while (w > 0);
-  }
-#endif
 }
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 78443c7..c9880cf 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -14,6 +14,8 @@
 
 #include "config/aom_config.h"
 
+#include "aom/internal/aom_codec_internal.h"
+
 #include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
 #include "av1/common/seg_common.h"
@@ -87,6 +89,7 @@
 
   AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
   TX_SIZE tx_buf[MAX_MIB_SIZE];
+  struct aom_internal_error_info error_info;
 } LFWorkerData;
 /*!\endcond */
 
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 17dcc49..38e1da9 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -94,11 +94,11 @@
   add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
   specialize qw/av1_highbd_convolve_horiz_rs sse4_1 neon/;
 
-  add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
-  specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/;
+  add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd";
+  specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2 neon/;
 }
 
-add_proto qw/void av1_wiener_convolve_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
+add_proto qw/void av1_wiener_convolve_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params";
 specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
 
 # directional intra predictor functions
@@ -255,11 +255,11 @@
 
 # build compound seg mask functions
 add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
+specialize qw/av1_build_compound_diffwtd_mask neon sse4_1 avx2/;
 
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
-  specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+  specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2 neon/;
 }
 
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
@@ -296,24 +296,24 @@
                                                        const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
                                                        int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                                                        int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
+  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3 neon/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                    const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
                                                    int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_upsampled_pred sse2/;
+    specialize qw/aom_highbd_upsampled_pred sse2 neon/;
 
     add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                             const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
                                                             int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2 neon/;
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                                 const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
                                                                 int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
                                                                 int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
+    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2 neon/;
   }
 
   # the transform coefficients are held in 32-bit
@@ -396,15 +396,16 @@
   #
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
+    specialize qw/av1_apply_temporal_filter sse2 avx2 neon neon_dotprod/;
 
     add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh";
-    specialize qw/av1_estimate_noise_from_single_plane avx2/;
+    specialize qw/av1_estimate_noise_from_single_plane avx2 neon/;
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
       add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-      specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
+      specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/;
 
       add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
+      specialize qw/av1_highbd_estimate_noise_from_single_plane neon/;
     }
   }
 
@@ -419,7 +420,7 @@
   # ENCODEMB INVOKE
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-    specialize qw/av1_highbd_block_error sse2 avx2/;
+    specialize qw/av1_highbd_block_error sse2 avx2 neon/;
   }
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -438,9 +439,9 @@
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/;
   add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
-  specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
+  specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon/;
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
-  specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
+  specialize qw/av1_wedge_compute_delta_squares sse2 avx2 neon/;
 
   # hash
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
@@ -448,43 +449,43 @@
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
-    specialize qw/av1_compute_stats sse4_1 avx2/;
-    add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
-    specialize qw/av1_calc_proj_params sse4_1 avx2/;
-    add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+    specialize qw/av1_compute_stats sse4_1 avx2 neon/;
+    add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+    specialize qw/av1_calc_proj_params sse4_1 avx2 neon/;
+    add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
     specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
 
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
-      specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2/;
-      add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+      add_proto qw/void av1_calc_proj_params_high_bd/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+      specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/;
+      add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
       specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
       add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
-      specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+      specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
     }
   }
 
-  add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
+  add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
 
-  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
+  add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
 
-  add_proto qw/void av1_nn_fast_softmax_16/, " const float *input_nodes, float *output";
+  add_proto qw/void av1_nn_fast_softmax_16/, "const float *input_nodes, float *output";
   if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
-    specialize qw/av1_nn_predict sse3 neon/;
+    specialize qw/av1_nn_predict sse3 avx2 neon/;
     specialize qw/av1_nn_fast_softmax_16 sse3/;
   }
 
   # CNN functions
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
-    add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
-    add_proto qw/bool av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
-    add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
+    add_proto qw/void av1_cnn_activate/, "float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+    add_proto qw/void av1_cnn_add/, "float **input, int channels, int width, int height, int stride, const float **add";
+    add_proto qw/bool av1_cnn_predict/, "const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+    add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
     if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
       specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/;
     }
-    add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+    add_proto qw/void av1_cnn_deconvolve/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
     add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
   }
 
@@ -540,17 +541,14 @@
 # WARPED_MOTION / GLOBAL_MOTION functions
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/;
 }
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 avx2 neon/;
-
-add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error sse2 avx2/;
+specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
 
 # LOOP_RESTORATION functions
-add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+add_proto qw/int av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
 specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
@@ -561,16 +559,22 @@
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
 add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
 add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
 add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
+add_proto qw/void av1_convolve_y_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
 add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
 add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
 add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
 add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
 if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_2d_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_x_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
+  add_proto qw/void av1_highbd_convolve_y_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
   add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
@@ -580,13 +584,16 @@
 
   add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
 
-  specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
-  specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+  specialize qw/av1_convolve_2d_sr_intrabc neon/;
+  specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+  specialize qw/av1_convolve_x_sr_intrabc neon/;
   specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_y_sr_intrabc neon/;
   specialize qw/av1_convolve_2d_scale sse4_1/;
-  specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon neon_dotprod neon_i8mm/;
   specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
-  specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
+  specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;
   specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
   if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/;
@@ -594,21 +601,26 @@
     specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/;
     specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/;
     specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon/;
+    specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/;
     specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon/;
+    specialize qw/av1_highbd_convolve_x_sr_intrabc neon/;
     specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon/;
+    specialize qw/av1_highbd_convolve_y_sr_intrabc neon/;
     specialize qw/av1_highbd_convolve_2d_scale sse4_1 neon/;
   }
 
 # INTRA_EDGE functions
 add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
-specialize qw/av1_filter_intra_edge sse4_1/;
+specialize qw/av1_filter_intra_edge sse4_1 neon/;
 add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
-specialize qw/av1_upsample_intra_edge sse4_1/;
+specialize qw/av1_upsample_intra_edge sse4_1 neon/;
 
-add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
-specialize qw/av1_filter_intra_edge_high sse4_1/;
-add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
-specialize qw/av1_upsample_intra_edge_high sse4_1/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_highbd_filter_intra_edge/, "uint16_t *p, int sz, int strength";
+  specialize qw/av1_highbd_filter_intra_edge sse4_1 neon/;
+  add_proto qw/void av1_highbd_upsample_intra_edge/, "uint16_t *p, int sz, int bd";
+  specialize qw/av1_highbd_upsample_intra_edge sse4_1 neon/;
+}
 
 # CFL
 add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
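The add_proto/specialize lines above only declare which SIMD flavours exist for each kernel; the per-CPU selection itself is emitted at build time by the RTCD scripts. As a rough, hypothetical illustration of that mechanism (this is not the generated header, and every name below is made up), the dispatch boils down to picking one function pointer from runtime CPU flags:

    #include <stdio.h>

    /* Hypothetical sketch of the run-time dispatch generated from "specialize"
     * lines: one portable C baseline plus optional SIMD variants, selected once
     * from CPU feature flags. Purely illustrative, not libaom's generated code. */
    #define HAS_SIMD_A (1 << 0) /* stand-in for a flag such as SSE4.1 or Neon */
    #define HAS_SIMD_B (1 << 1) /* stand-in for a higher extension, e.g. AVX2 */

    typedef int (*kernel_fn)(int x);

    static int kernel_c(int x) { return x + 1; }      /* portable baseline   */
    static int kernel_simd_a(int x) { return x + 1; } /* same result, faster */
    static int kernel_simd_b(int x) { return x + 1; }

    static kernel_fn pick_kernel(int cpu_flags) {
      kernel_fn fn = kernel_c; /* the C version always exists as a fallback */
      if (cpu_flags & HAS_SIMD_A) fn = kernel_simd_a;
      if (cpu_flags & HAS_SIMD_B) fn = kernel_simd_b; /* later/higher flags win */
      return fn;
    }

    int main(void) {
      kernel_fn fn = pick_kernel(HAS_SIMD_A);
      printf("%d\n", fn(41)); /* 42, whichever variant was selected */
      return 0;
    }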
diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c
index ac43402..011403b 100644
--- a/av1/common/av1_txfm.c
+++ b/av1/common/av1_txfm.c
@@ -15,7 +15,7 @@
 #include "av1/common/av1_txfm.h"
 
 // av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)));
-const int32_t av1_cospi_arr_data[7][64] = {
+const int32_t av1_cospi_arr_data[4][64] = {
   { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
     972,  964,  955,  946,  936,  926,  915,  903,  891,  878, 865, 851, 837,
     822,  807,  792,  775,  759,  742,  724,  706,  688,  669, 650, 630, 610,
@@ -38,36 +38,153 @@
     7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
     5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
     3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
-    1795, 1598, 1401, 1202, 1003, 803,  603,  402,  201 },
-  { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
-    15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
-    14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
-    11297, 11003, 10702, 10394, 10080, 9760,  9434,  9102,  8765,  8423,  8076,
-    7723,  7366,  7005,  6639,  6270,  5897,  5520,  5139,  4756,  4370,  3981,
-    3590,  3196,  2801,  2404,  2006,  1606,  1205,  804,   402 },
-  { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
-    31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
-    28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
-    22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
-    15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512,  8740,  7962,
-    7180,  6393,  5602,  4808,  4011,  3212,  2411,  1608,  804 },
-  { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
-    63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
-    56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
-    45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
-    30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
-    14359, 12785, 11204, 9616,  8022,  6424,  4821,  3216,  1608 }
+    1795, 1598, 1401, 1202, 1003, 803,  603,  402,  201 }
 };
 
 // av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
 // << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
-const int32_t av1_sinpi_arr_data[7][5] = {
-  { 0, 330, 621, 836, 951 },        { 0, 660, 1241, 1672, 1901 },
-  { 0, 1321, 2482, 3344, 3803 },    { 0, 2642, 4964, 6689, 7606 },
-  { 0, 5283, 9929, 13377, 15212 },  { 0, 10566, 19858, 26755, 30424 },
-  { 0, 21133, 39716, 53510, 60849 }
+const int32_t av1_sinpi_arr_data[4][5] = { { 0, 330, 621, 836, 951 },
+                                           { 0, 660, 1241, 1672, 1901 },
+                                           { 0, 1321, 2482, 3344, 3803 },
+                                           { 0, 2642, 4964, 6689, 7606 } };
+
+// The reduced bit-width arrays are only used in the Arm Neon implementations
+// in av1_fwd_txfm2d_neon.c for now.
+#if HAVE_NEON
+// Constants are stored in groups of four, where symmetrical constants in the
+// cospi array are stored adjacent in memory, followed immediately by the same
+// constants but negated, i.e.:
+//   f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))) << (3-i)
+// and then in memory we store 4-tuples of constants together as:
+//   f4(i,j) = [ f(i,j), f(i,64-j), -f(i,j), -f(i,64-j) ]
+//
+// Constants are stored in Q2.13 format, see:
+// https://en.wikipedia.org/wiki/Q_(number_format)
+//
+// The order of the constants is such that increasing subdivisions of 64 store
+// f4 tuples contiguously:
+// av1_cospi_arr_q13_data[i] = {
+//   f4(i,32),  // f(i,32) twice
+//   f4(i,16),  // f(i,16) and f(i,48), f4(i,32) skipped since present above.
+//   f4(i,8), f4(i,24), // f4(i,16) and f4(i,32) skipped since present above.
+//   f4(i,4), f4(i,12), f4(i,20), f4(i,28),
+//   f4(i,2), f4(i,6), f4(i,10), f4(i,14), f4(i,18), ...
+//   f4(i,1), f4(i,3), f4(i,5), f4(i,7), f4(i,9), f4(i,11), ...
+// }
+const int16_t av1_cospi_arr_q13_data[4][128] = {
+  {
+      5792,  5792,  -5792, -5792, 7568,  3136,  -7568, -3136, 8032,  1600,
+      -8032, -1600, 6808,  4552,  -6808, -4552, 8152,  800,   -8152, -800,
+      7840,  2376,  -7840, -2376, 7224,  3864,  -7224, -3864, 6336,  5200,
+      -6336, -5200, 8184,  400,   -8184, -400,  8104,  1200,  -8104, -1200,
+      7944,  1992,  -7944, -1992, 7712,  2760,  -7712, -2760, 7408,  3504,
+      -7408, -3504, 7024,  4208,  -7024, -4208, 6576,  4880,  -6576, -4880,
+      6072,  5504,  -6072, -5504, 8192,  200,   -8192, -200,  8168,  600,
+      -8168, -600,  8128,  1000,  -8128, -1000, 8072,  1400,  -8072, -1400,
+      7992,  1792,  -7992, -1792, 7896,  2184,  -7896, -2184, 7776,  2568,
+      -7776, -2568, 7640,  2952,  -7640, -2952, 7488,  3320,  -7488, -3320,
+      7320,  3680,  -7320, -3680, 7128,  4040,  -7128, -4040, 6920,  4384,
+      -6920, -4384, 6696,  4720,  -6696, -4720, 6456,  5040,  -6456, -5040,
+      6200,  5352,  -6200, -5352, 5936,  5648,  -5936, -5648,
+  },
+  {
+      5792,  5792,  -5792, -5792, 7568,  3136,  -7568, -3136, 8036,  1600,
+      -8036, -1600, 6812,  4552,  -6812, -4552, 8152,  804,   -8152, -804,
+      7840,  2380,  -7840, -2380, 7224,  3860,  -7224, -3860, 6332,  5196,
+      -6332, -5196, 8184,  400,   -8184, -400,  8104,  1204,  -8104, -1204,
+      7948,  1992,  -7948, -1992, 7712,  2760,  -7712, -2760, 7404,  3504,
+      -7404, -3504, 7028,  4212,  -7028, -4212, 6580,  4880,  -6580, -4880,
+      6068,  5500,  -6068, -5500, 8188,  200,   -8188, -200,  8168,  604,
+      -8168, -604,  8132,  1004,  -8132, -1004, 8072,  1400,  -8072, -1400,
+      7992,  1796,  -7992, -1796, 7896,  2184,  -7896, -2184, 7780,  2568,
+      -7780, -2568, 7644,  2948,  -7644, -2948, 7488,  3320,  -7488, -3320,
+      7316,  3684,  -7316, -3684, 7128,  4036,  -7128, -4036, 6920,  4384,
+      -6920, -4384, 6696,  4716,  -6696, -4716, 6460,  5040,  -6460, -5040,
+      6204,  5352,  -6204, -5352, 5932,  5648,  -5932, -5648,
+  },
+  {
+      5792,  5792,  -5792, -5792, 7568,  3134,  -7568, -3134, 8034,  1598,
+      -8034, -1598, 6812,  4552,  -6812, -4552, 8152,  802,   -8152, -802,
+      7840,  2378,  -7840, -2378, 7224,  3862,  -7224, -3862, 6332,  5196,
+      -6332, -5196, 8182,  402,   -8182, -402,  8104,  1202,  -8104, -1202,
+      7946,  1990,  -7946, -1990, 7714,  2760,  -7714, -2760, 7406,  3502,
+      -7406, -3502, 7026,  4212,  -7026, -4212, 6580,  4880,  -6580, -4880,
+      6070,  5502,  -6070, -5502, 8190,  202,   -8190, -202,  8170,  602,
+      -8170, -602,  8130,  1002,  -8130, -1002, 8072,  1400,  -8072, -1400,
+      7992,  1794,  -7992, -1794, 7896,  2184,  -7896, -2184, 7778,  2570,
+      -7778, -2570, 7644,  2948,  -7644, -2948, 7490,  3320,  -7490, -3320,
+      7318,  3684,  -7318, -3684, 7128,  4038,  -7128, -4038, 6922,  4382,
+      -6922, -4382, 6698,  4718,  -6698, -4718, 6458,  5040,  -6458, -5040,
+      6204,  5350,  -6204, -5350, 5934,  5648,  -5934, -5648,
+  },
+  {
+      5793,  5793,  -5793, -5793, 7568,  3135,  -7568, -3135, 8035,  1598,
+      -8035, -1598, 6811,  4551,  -6811, -4551, 8153,  803,   -8153, -803,
+      7839,  2378,  -7839, -2378, 7225,  3862,  -7225, -3862, 6333,  5197,
+      -6333, -5197, 8182,  402,   -8182, -402,  8103,  1202,  -8103, -1202,
+      7946,  1990,  -7946, -1990, 7713,  2760,  -7713, -2760, 7405,  3503,
+      -7405, -3503, 7027,  4212,  -7027, -4212, 6580,  4880,  -6580, -4880,
+      6070,  5501,  -6070, -5501, 8190,  201,   -8190, -201,  8170,  603,
+      -8170, -603,  8130,  1003,  -8130, -1003, 8071,  1401,  -8071, -1401,
+      7993,  1795,  -7993, -1795, 7895,  2185,  -7895, -2185, 7779,  2570,
+      -7779, -2570, 7643,  2948,  -7643, -2948, 7489,  3320,  -7489, -3320,
+      7317,  3683,  -7317, -3683, 7128,  4038,  -7128, -4038, 6921,  4383,
+      -6921, -4383, 6698,  4717,  -6698, -4717, 6458,  5040,  -6458, -5040,
+      6203,  5351,  -6203, -5351, 5933,  5649,  -5933, -5649,
+  }
 };
 
+// av1_sinpi_arr_q13_data[i][j] =
+//   round((sqrt2 * sin((j+1)*Pi/9) * 2/3) * (1 << (cos_bit_min + i))) << (3-i)
+// modified so that elements j=0,1 sum to element j=3.
+// See also: https://en.wikipedia.org/wiki/Q_(number_format)
+const int16_t av1_sinpi_arr_q13_data[4][4] = { { 2640, 4968, 6688, 7608 },
+                                               { 2640, 4964, 6688, 7604 },
+                                               { 2642, 4964, 6688, 7606 },
+                                               { 2642, 4964, 6689, 7606 } };
+
+// Constants are stored in pairs, where symmetrical constants in the
+// cospi array are stored adjacent in memory, i.e.:
+//   f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)))
+// and then in memory we store pairs of constants together as:
+//   f2(i,j) = [ f(i,j), f(i,64-j) ]
+const int32_t av1_cospi_arr_s32_data[4][66] = {
+  {
+      1024, 0,    1024, 25,   1023, 50,   1021, 75,  1019, 100, 1016,
+      125,  1013, 150,  1009, 175,  1004, 200,  999, 224,  993, 249,
+      987,  273,  980,  297,  972,  321,  964,  345, 955,  369, 946,
+      392,  936,  415,  926,  438,  915,  460,  903, 483,  891, 505,
+      878,  526,  865,  548,  851,  569,  837,  590, 822,  610, 807,
+      630,  792,  650,  775,  669,  759,  688,  742, 706,  724, 724,
+  },
+  {
+      2048, 0,    2047, 50,   2046, 100,  2042, 151,  2038, 201,  2033,
+      251,  2026, 301,  2018, 350,  2009, 400,  1998, 449,  1987, 498,
+      1974, 546,  1960, 595,  1945, 642,  1928, 690,  1911, 737,  1892,
+      784,  1872, 830,  1851, 876,  1829, 921,  1806, 965,  1782, 1009,
+      1757, 1053, 1730, 1096, 1703, 1138, 1674, 1179, 1645, 1220, 1615,
+      1260, 1583, 1299, 1551, 1338, 1517, 1375, 1483, 1412, 1448, 1448,
+  },
+  {
+      4096, 0,    4095, 101,  4091, 201,  4085, 301,  4076, 401,  4065,
+      501,  4052, 601,  4036, 700,  4017, 799,  3996, 897,  3973, 995,
+      3948, 1092, 3920, 1189, 3889, 1285, 3857, 1380, 3822, 1474, 3784,
+      1567, 3745, 1660, 3703, 1751, 3659, 1842, 3612, 1931, 3564, 2019,
+      3513, 2106, 3461, 2191, 3406, 2276, 3349, 2359, 3290, 2440, 3229,
+      2520, 3166, 2598, 3102, 2675, 3035, 2751, 2967, 2824, 2896, 2896,
+  },
+  {
+      8192, 0,    8190, 201,  8182, 402,  8170, 603,  8153, 803,  8130,
+      1003, 8103, 1202, 8071, 1401, 8035, 1598, 7993, 1795, 7946, 1990,
+      7895, 2185, 7839, 2378, 7779, 2570, 7713, 2760, 7643, 2948, 7568,
+      3135, 7489, 3320, 7405, 3503, 7317, 3683, 7225, 3862, 7128, 4038,
+      7027, 4212, 6921, 4383, 6811, 4551, 6698, 4717, 6580, 4880, 6458,
+      5040, 6333, 5197, 6203, 5351, 6070, 5501, 5933, 5649, 5793, 5793,
+  }
+};
+
+#endif  // HAVE_NEON
+
 void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
   int i;
   if (bit == 0) {
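To make the f4 layout comment above concrete, the following standalone check (not part of this change) reproduces the start of row 0 of av1_cospi_arr_q13_data from the stated formula, with cos_bit_min = 10:

    #include <assert.h>
    #include <math.h>

    /* Standalone sanity check, not libaom code:
     *   f(i,j) = (int)round(cos(PI*j/128) * (1 << (cos_bit_min + i))) << (3 - i)
     * with cos_bit_min == 10, so every row lands in Q2.13. */
    static int f_q13(int i, int j) {
      const double pi = acos(-1.0);
      return ((int)round(cos(pi * j / 128) * (1 << (10 + i)))) << (3 - i);
    }

    int main(void) {
      /* f4(0,32) stores f(0,32) twice, then both negated: 5792, 5792, -5792, -5792. */
      assert(f_q13(0, 32) == 5792);
      /* f4(0,16) pairs f(0,16) with f(0,48): 7568, 3136, -7568, -3136. */
      assert(f_q13(0, 16) == 7568);
      assert(f_q13(0, 48) == 3136);
      return 0;
    }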
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index be1164f..7ad70af 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -31,13 +31,12 @@
 #define DO_RANGE_CHECK_CLAMP 0
 #endif
 
-extern const int32_t av1_cospi_arr_data[7][64];
-extern const int32_t av1_sinpi_arr_data[7][5];
+extern const int32_t av1_cospi_arr_data[4][64];
+extern const int32_t av1_sinpi_arr_data[4][5];
 
 #define MAX_TXFM_STAGE_NUM 12
 
 static const int cos_bit_min = 10;
-static const int cos_bit_max = 16;
 
 #define NewSqrt2Bits ((int32_t)12)
 // 2^12 * sqrt(2)
@@ -53,6 +52,29 @@
   return av1_sinpi_arr_data[n - cos_bit_min];
 }
 
+// The reduced bit-width and permuted arrays are only used in the Arm Neon
+// implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now.
+#if HAVE_NEON
+// Store cospi/sinpi constants in Q2.13 format.
+// See: https://en.wikipedia.org/wiki/Q_(number_format)
+extern const int16_t av1_cospi_arr_q13_data[4][128];
+extern const int16_t av1_sinpi_arr_q13_data[4][4];
+
+extern const int32_t av1_cospi_arr_s32_data[4][66];
+
+static INLINE const int16_t *cospi_arr_q13(int n) {
+  return av1_cospi_arr_q13_data[n - cos_bit_min];
+}
+
+static INLINE const int16_t *sinpi_arr_q13(int n) {
+  return av1_sinpi_arr_q13_data[n - cos_bit_min];
+}
+
+static INLINE const int32_t *cospi_arr_s32(int n) {
+  return av1_cospi_arr_s32_data[n - cos_bit_min];
+}
+#endif  // HAVE_NEON
+
 static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   const int64_t max_value = (1LL << (bit - 1)) - 1;
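With cos_bit_max gone and the tables reduced to four rows, the helpers above are only meaningful for cos_bit values of 10 through 13; cos_bit - cos_bit_min selects the row. A minimal usage sketch (illustrative only, and assuming an Arm build where HAVE_NEON is set so the reduced-width tables exist):

    #include "av1/common/av1_txfm.h"

    /* Illustrative only: a transform stage would get cos_bit from its
     * configuration; valid values here are 10..13. */
    static void load_reduced_tables(int cos_bit, const int16_t **cospi_q13,
                                    const int16_t **sinpi_q13,
                                    const int32_t **cospi_s32) {
      *cospi_q13 = cospi_arr_q13(cos_bit); /* row cos_bit - cos_bit_min */
      *sinpi_q13 = sinpi_arr_q13(cos_bit);
      *cospi_s32 = cospi_arr_s32(cos_bit);
    }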
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 202f9d6..12e9545 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -20,6 +20,7 @@
 #include "av1/common/cdef.h"
 #include "av1/common/cdef_block.h"
 #include "av1/common/reconinter.h"
+#include "av1/common/thread_common.h"
 
 static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
                              int mi_stride) {
@@ -413,12 +414,25 @@
                      uint16_t **const linebuf, uint16_t **const colbuf,
                      uint16_t *const src, int fbr,
                      cdef_init_fb_row_t cdef_init_fb_row_fn,
-                     struct AV1CdefSyncData *const cdef_sync) {
+                     struct AV1CdefSyncData *const cdef_sync,
+                     struct aom_internal_error_info *error_info) {
+  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+  // in future to handle error propagation.
+  (void)error_info;
   CdefBlockInfo fb_info;
   int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
   const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
 
   cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
+#if CONFIG_MULTITHREAD
+  if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) {
+    pthread_mutex_lock(cdef_sync->mutex_);
+    const bool cdef_mt_exit = cdef_sync->cdef_mt_exit;
+    pthread_mutex_unlock(cdef_sync->mutex_);
+    // Exit in case any worker has encountered an error.
+    if (cdef_mt_exit) return;
+  }
+#endif
   for (int fbc = 0; fbc < nhfb; fbc++) {
     fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
     if (fbc != nhfb - 1)
@@ -447,5 +461,6 @@
 
   for (int fbr = 0; fbr < nvfb; fbr++)
     av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
-                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL);
+                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL,
+                    xd->error_info);
 }
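The mutex-guarded cdef_mt_exit check added above is the consumer side of the new error path; a worker that hits an error is expected to publish the flag the same way before returning. A minimal sketch of that producer side, assuming the AV1CdefSyncData fields used above (the real worker-side propagation lives in the multithreading code and may be structured differently):

    #if CONFIG_MULTITHREAD
    /* Hypothetical helper mirroring the check in av1_cdef_fb_row(); not the
     * actual libaom worker code. */
    static void cdef_publish_error(struct AV1CdefSyncData *cdef_sync) {
      pthread_mutex_lock(cdef_sync->mutex_);
      cdef_sync->cdef_mt_exit = true; /* makes every other row worker return early */
      pthread_mutex_unlock(cdef_sync->mutex_);
    }
    #endif  // CONFIG_MULTITHREAD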
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index e166f4b..a56cd9d 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -98,7 +98,8 @@
                      uint16_t **const linebuf, uint16_t **const colbuf,
                      uint16_t *const src, int fbr,
                      cdef_init_fb_row_t cdef_init_fb_row_fn,
-                     struct AV1CdefSyncData *const cdef_sync);
+                     struct AV1CdefSyncData *const cdef_sync,
+                     struct aom_internal_error_info *error_info);
 void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
                           const MACROBLOCKD *const xd,
                           CdefBlockInfo *const fb_info,
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 455a896..b5e4f12 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -47,9 +47,6 @@
                                        int coeff_shift, int block_width,
                                        int block_height);
 
-void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                              cdef_list *dlist, int cdef_count, int bsize);
-
 void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
                         const uint16_t *in, int xdec, int ydec,
                         int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 6d4221e..0e37d45 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -184,8 +184,8 @@
   cfl->are_parameters_computed = 1;
 }
 
-void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       TX_SIZE tx_size, int plane) {
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                           TX_SIZE tx_size, int plane) {
   CFL_CTX *const cfl = &xd->cfl;
   MB_MODE_INFO *mbmi = xd->mi[0];
   assert(is_cfl_allowed(xd));
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index af8b833..dcaa87b 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -72,8 +72,8 @@
   cfl->dc_pred_is_cached[CFL_PRED_V] = false;
 }
 
-void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       TX_SIZE tx_size, int plane);
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                           TX_SIZE tx_size, int plane);
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
 
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 9bca542..bb72e0c 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -787,6 +787,120 @@
   }
 }
 
+// This function produces the same results as av1_highbd_convolve_2d_sr_c, but
+// is specialized for intrabc. Use the following 2-tap filter:
+// DECLARE_ALIGNED(256, static const int16_t,
+//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+//   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+//   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// };
+void av1_highbd_convolve_2d_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+  assert(subpel_x_qn == 8);
+  assert(subpel_y_qn == 8);
+  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)conv_params;
+
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + 1;
+  int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+  // horizontal filter
+  // explicitly operate for subpel_x_qn = 8.
+  int16_t *im = im_block;
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+      sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+      im[x] = sum;
+    }
+    src += src_stride;
+    im += im_stride;
+  }
+
+  // vertical filter
+  // explicitly operate for subpel_y_qn = 8.
+  int16_t *src_vert = im_block;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const int32_t sum =
+          (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
+
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+    src_vert += im_stride;
+    dst += dst_stride;
+  }
+}
+
+// This function produces the same results as av1_highbd_convolve_y_sr_c, but
+// is specialized for intrabc.
+void av1_highbd_convolve_y_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+
+  // vertical filter
+  // explicitly operate for subpel_y_qn = 8.
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const int32_t res = src[x] + src[src_stride + x];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// This function produces the same results as av1_highbd_convolve_x_sr_c, but
+// is specialized for intrabc.
+void av1_highbd_convolve_x_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  const int bits = FILTER_BITS - conv_params->round_0;
+  assert(bits >= 0);
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+
+  // horizontal filter
+  // explicitly operate for subpel_x_qn = 8.
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 64 * (src[x] + src[x + 1]);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
 void av1_highbd_dist_wtd_convolve_2d_c(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
@@ -1139,14 +1253,31 @@
   (void)dst_stride;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 
-  const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
-  const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
-  const InterpFilterParams *filter_params_x =
-      need_filter_params_x ? interp_filters[0] : NULL;
-  const InterpFilterParams *filter_params_y =
-      need_filter_params_y ? interp_filters[1] : NULL;
+  const InterpFilterParams *filter_params_x = interp_filters[0];
+  const InterpFilterParams *filter_params_y = interp_filters[1];
 
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  // A 2-tap filter indicates that this is the IntraBC path.
+  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
+    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+    assert(!scaled);
+    if (subpel_x_qn && subpel_y_qn) {
+      av1_highbd_convolve_2d_sr_intrabc_c(
+          src, src_stride, dst, dst_stride, w, h, filter_params_x,
+          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+      return;
+    } else if (subpel_x_qn) {
+      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                         filter_params_x, subpel_x_qn,
+                                         conv_params, bd);
+      return;
+    } else if (subpel_y_qn) {
+      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                         filter_params_y, subpel_y_qn, bd);
+      return;
+    }
+  }
+
   if (scaled) {
     if (conv_params->is_compound) {
       assert(conv_params->dst != NULL);
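The taps == 2 check above routes IntraBC blocks to the new specialized kernels. Because both taps of the bilinear filter are 64 at the half-pel offset these kernels assert on (subpel == 8), the separable x/y paths collapse to a rounded average of two neighbouring pixels (and the 2D path to an average over a 2x2 neighbourhood). A standalone check of that arithmetic, not libaom code, with round_0 = 3 assumed as a typical single-reference value and a local copy of the rounding macro:

    #include <assert.h>
    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))

    /* With both taps equal to 64 and round_0 + (FILTER_BITS - round_0) == 7,
     * the horizontal intrabc path is just (a + b + 1) >> 1. */
    static uint16_t intrabc_x_average(uint16_t a, uint16_t b) {
      const int round_0 = 3;        /* assumed typical value */
      const int bits = 7 - round_0; /* FILTER_BITS - round_0 */
      int32_t res = 64 * (a + b);
      res = ROUND_POWER_OF_TWO(res, round_0);
      return (uint16_t)ROUND_POWER_OF_TWO(res, bits);
    }

    int main(void) {
      assert(intrabc_x_average(100, 101) == 101); /* (100 + 101 + 1) >> 1 */
      assert(intrabc_x_average(7, 7) == 7);
      return 0;
    }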
@@ -1269,7 +1400,7 @@
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h,
-                                   const ConvolveParams *conv_params) {
+                                   const WienerConvolveParams *conv_params) {
   const InterpKernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
 
@@ -1349,7 +1480,7 @@
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     const int16_t *filter_y, int y_step_q4, int w, int h,
-    const ConvolveParams *conv_params, int bd) {
+    const WienerConvolveParams *conv_params, int bd) {
   const InterpKernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
 
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 36c0c84..d6dd876 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -31,6 +31,11 @@
   int bck_offset;
 } ConvolveParams;
 
+typedef struct WienerConvolveParams {
+  int round_0;
+  int round_1;
+} WienerConvolveParams;
+
 #define ROUND0_BITS 3
 #define COMPOUND_ROUND1_BITS 7
 #define WIENER_ROUND0_BITS 3
@@ -99,11 +104,8 @@
   return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
 }
 
-static INLINE ConvolveParams get_conv_params_wiener(int bd) {
-  ConvolveParams conv_params;
-  (void)bd;
-  conv_params.do_average = 0;
-  conv_params.is_compound = 0;
+static INLINE WienerConvolveParams get_conv_params_wiener(int bd) {
+  WienerConvolveParams conv_params;
   conv_params.round_0 = WIENER_ROUND0_BITS;
   conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
   const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
@@ -112,9 +114,6 @@
     conv_params.round_0 += intbufrange - 16;
     conv_params.round_1 -= intbufrange - 16;
   }
-  conv_params.dst = NULL;
-  conv_params.dst_stride = 0;
-  conv_params.plane = 0;
   return conv_params;
 }
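Since get_conv_params_wiener() now fills the slimmed-down WienerConvolveParams, it is worth spelling out what the bit-depth adjustment yields. A standalone recomputation mirroring the function above (WIENER_ROUND0_BITS = 3, FILTER_BITS = 7):

    #include <assert.h>

    /* Mirrors get_conv_params_wiener(): the intermediate range
     * bd + FILTER_BITS - round_0 + 2 must fit in 16 bits, so higher bit
     * depths shift more in the first pass. Standalone, not libaom code. */
    static void wiener_rounds(int bd, int *round_0, int *round_1) {
      *round_0 = 3;                /* WIENER_ROUND0_BITS */
      *round_1 = 2 * 7 - *round_0; /* 2 * FILTER_BITS - round_0 */
      const int intbufrange = bd + 7 - *round_0 + 2;
      if (intbufrange > 16) {
        *round_0 += intbufrange - 16;
        *round_1 -= intbufrange - 16;
      }
    }

    int main(void) {
      int r0, r1;
      wiener_rounds(8, &r0, &r1);  /* intbufrange = 14 */
      assert(r0 == 3 && r1 == 11);
      wiener_rounds(10, &r0, &r1); /* intbufrange = 16, still fits */
      assert(r0 == 3 && r1 == 11);
      wiener_rounds(12, &r0, &r1); /* intbufrange = 18, shift two more bits */
      assert(r0 == 5 && r1 == 9);
      return 0;
    }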
 
diff --git a/av1/common/enums.h b/av1/common/enums.h
index fb4d756..b99a138 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -27,8 +27,6 @@
 
 /*!\cond */
 
-#undef MAX_SB_SIZE
-
 // Max superblock size
 #define MAX_SB_SIZE_LOG2 7
 #define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
@@ -199,7 +197,7 @@
 #define TX_PAD_END 16
 #define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
 
-// Number of maxium size transform blocks in the maximum size superblock
+// Number of maximum size transform blocks in the maximum size superblock
 #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
 #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
 
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 3704b8a..67fb13f 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1048,41 +1048,6 @@
   p_left[-1] = s;
 }
 
-void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
-  if (!strength) return;
-
-  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
-                                                         { 0, 5, 6, 5, 0 },
-                                                         { 2, 4, 4, 4, 2 } };
-  const int filt = strength - 1;
-  uint16_t edge[129];
-
-  memcpy(edge, p, sz * sizeof(*p));
-  for (int i = 1; i < sz; i++) {
-    int s = 0;
-    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
-      int k = i - 2 + j;
-      k = (k < 0) ? 0 : k;
-      k = (k > sz - 1) ? sz - 1 : k;
-      s += edge[k] * kernel[filt][j];
-    }
-    s = (s + 8) >> 4;
-    p[i] = s;
-  }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
-  const int kernel[3] = { 5, 6, 5 };
-
-  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
-          (p_above[0] * kernel[2]);
-  s = (s + 8) >> 4;
-  p_above[-1] = s;
-  p_left[-1] = s;
-}
-#endif
-
 void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
   // interpolate half-sample positions
   assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1106,206 +1071,6 @@
   }
 }
 
-void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
-  // interpolate half-sample positions
-  assert(sz <= MAX_UPSAMPLE_SZ);
-
-  uint16_t in[MAX_UPSAMPLE_SZ + 3];
-  // copy p[-1..(sz-1)] and extend first and last samples
-  in[0] = p[-1];
-  in[1] = p[-1];
-  for (int i = 0; i < sz; i++) {
-    in[i + 2] = p[i];
-  }
-  in[sz + 2] = p[sz - 1];
-
-  // interpolate half-sample edge positions
-  p[-2] = in[0];
-  for (int i = 0; i < sz; i++) {
-    int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
-    s = (s + 8) >> 4;
-    s = clip_pixel_highbd(s, bd);
-    p[2 * i - 1] = s;
-    p[2 * i] = in[i + 2];
-  }
-}
-#if CONFIG_AV1_HIGHBITDEPTH
-static void build_intra_predictors_high(
-    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
-    PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
-    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
-    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
-    int bit_depth) {
-  int i;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
-  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
-  uint16_t *const above_row = above_data + 16;
-  uint16_t *const left_col = left_data + 16;
-  const int txwpx = tx_size_wide[tx_size];
-  const int txhpx = tx_size_high[tx_size];
-  int need_left = extend_modes[mode] & NEED_LEFT;
-  int need_above = extend_modes[mode] & NEED_ABOVE;
-  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
-  const uint16_t *above_ref = ref - ref_stride;
-  const uint16_t *left_ref = ref - 1;
-  const int is_dr_mode = av1_is_directional_mode(mode);
-  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  int base = 128 << (bit_depth - 8);
-  // The left_data, above_data buffers must be zeroed to fix some intermittent
-  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
-  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
-  // seen to be the potential reason for this issue.
-  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
-  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
-
-  // The default values if ref pixels are not available:
-  // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
-  // base+1   A      B  ..     Y      Z
-  // base+1   C      D  ..     W      X
-  // base+1   E      F  ..     U      V
-  // base+1   G      H  ..     S      T      T      T      T      T
-
-  if (is_dr_mode) {
-    if (p_angle <= 90)
-      need_above = 1, need_left = 0, need_above_left = 1;
-    else if (p_angle < 180)
-      need_above = 1, need_left = 1, need_above_left = 1;
-    else
-      need_above = 0, need_left = 1, need_above_left = 1;
-  }
-  if (use_filter_intra) need_left = need_above = need_above_left = 1;
-
-  assert(n_top_px >= 0);
-  assert(n_topright_px >= -1);
-  assert(n_left_px >= 0);
-  assert(n_bottomleft_px >= -1);
-
-  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
-    int val;
-    if (need_left) {
-      val = (n_top_px > 0) ? above_ref[0] : base + 1;
-    } else {
-      val = (n_left_px > 0) ? left_ref[0] : base - 1;
-    }
-    for (i = 0; i < txhpx; ++i) {
-      aom_memset16(dst, val, txwpx);
-      dst += dst_stride;
-    }
-    return;
-  }
-
-  // NEED_LEFT
-  if (need_left) {
-    const int num_left_pixels_needed =
-        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
-    i = 0;
-    if (n_left_px > 0) {
-      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
-      if (n_bottomleft_px > 0) {
-        assert(i == txhpx);
-        for (; i < txhpx + n_bottomleft_px; i++)
-          left_col[i] = left_ref[i * ref_stride];
-      }
-      if (i < num_left_pixels_needed)
-        aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else if (n_top_px > 0) {
-      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
-    }
-  }
-
-  // NEED_ABOVE
-  if (need_above) {
-    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
-    if (n_top_px > 0) {
-      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
-      i = n_top_px;
-      if (n_topright_px > 0) {
-        assert(n_top_px == txwpx);
-        memcpy(above_row + txwpx, above_ref + txwpx,
-               n_topright_px * sizeof(above_ref[0]));
-        i += n_topright_px;
-      }
-      if (i < num_top_pixels_needed)
-        aom_memset16(&above_row[i], above_row[i - 1],
-                     num_top_pixels_needed - i);
-    } else if (n_left_px > 0) {
-      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
-    }
-  }
-
-  if (need_above_left) {
-    if (n_top_px > 0 && n_left_px > 0) {
-      above_row[-1] = above_ref[-1];
-    } else if (n_top_px > 0) {
-      above_row[-1] = above_ref[0];
-    } else if (n_left_px > 0) {
-      above_row[-1] = left_ref[0];
-    } else {
-      above_row[-1] = base;
-    }
-    left_col[-1] = above_row[-1];
-  }
-
-  if (use_filter_intra) {
-    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                                  filter_intra_mode, bit_depth);
-    return;
-  }
-
-  if (is_dr_mode) {
-    int upsample_above = 0;
-    int upsample_left = 0;
-    if (!disable_edge_filter) {
-      const int need_right = p_angle < 90;
-      const int need_bottom = p_angle > 180;
-      if (p_angle != 90 && p_angle != 180) {
-        const int ab_le = need_above_left ? 1 : 0;
-        if (need_above && need_left && (txwpx + txhpx >= 24)) {
-          filter_intra_edge_corner_high(above_row, left_col);
-        }
-        if (need_above && n_top_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
-          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
-          av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
-        }
-        if (need_left && n_left_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
-          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
-          av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
-        }
-      }
-      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
-                                                   intra_edge_filter_type);
-      if (need_above && upsample_above) {
-        const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_upsample_intra_edge_high(above_row, n_px, bit_depth);
-      }
-      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
-                                                  intra_edge_filter_type);
-      if (need_left && upsample_left) {
-        const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_upsample_intra_edge_high(left_col, n_px, bit_depth);
-      }
-    }
-    highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                        upsample_above, upsample_left, p_angle, bit_depth);
-    return;
-  }
-
-  // predict
-  if (mode == DC_PRED) {
-    dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
-        dst, dst_stride, above_row, left_col, bit_depth);
-  } else {
-    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
 static void build_intra_predictors(
     const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
     PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
@@ -1476,6 +1241,241 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+                                                         { 0, 5, 6, 5, 0 },
+                                                         { 2, 4, 4, 4, 2 } };
+  const int filt = strength - 1;
+  uint16_t edge[129];
+
+  memcpy(edge, p, sz * sizeof(*p));
+  for (int i = 1; i < sz; i++) {
+    int s = 0;
+    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+      int k = i - 2 + j;
+      k = (k < 0) ? 0 : k;
+      k = (k > sz - 1) ? sz - 1 : k;
+      s += edge[k] * kernel[filt][j];
+    }
+    s = (s + 8) >> 4;
+    p[i] = s;
+  }
+}
+
+static void highbd_filter_intra_edge_corner(uint16_t *p_above,
+                                            uint16_t *p_left) {
+  const int kernel[3] = { 5, 6, 5 };
+
+  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+          (p_above[0] * kernel[2]);
+  s = (s + 8) >> 4;
+  p_above[-1] = s;
+  p_left[-1] = s;
+}
+
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) {
+  // interpolate half-sample positions
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  uint16_t in[MAX_UPSAMPLE_SZ + 3];
+  // copy p[-1..(sz-1)] and extend first and last samples
+  in[0] = p[-1];
+  in[1] = p[-1];
+  for (int i = 0; i < sz; i++) {
+    in[i + 2] = p[i];
+  }
+  in[sz + 2] = p[sz - 1];
+
+  // interpolate half-sample edge positions
+  p[-2] = in[0];
+  for (int i = 0; i < sz; i++) {
+    int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
+    s = (s + 8) >> 4;
+    s = clip_pixel_highbd(s, bd);
+    p[2 * i - 1] = s;
+    p[2 * i] = in[i + 2];
+  }
+}
+
+static void highbd_build_intra_predictors(
+    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+    PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
+    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
+    int bit_depth) {
+  int i;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  uint16_t *const above_row = above_data + 16;
+  uint16_t *const left_col = left_data + 16;
+  const int txwpx = tx_size_wide[tx_size];
+  const int txhpx = tx_size_high[tx_size];
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
+  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+  const uint16_t *above_ref = ref - ref_stride;
+  const uint16_t *left_ref = ref - 1;
+  const int is_dr_mode = av1_is_directional_mode(mode);
+  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+  int base = 128 << (bit_depth - 8);
+  // The left_data, above_data buffers must be zeroed to fix some intermittent
+  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
+  // seen to be the potential reason for this issue.
+  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+
+  // The default values if ref pixels are not available:
+  // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+  // base+1   A      B  ..     Y      Z
+  // base+1   C      D  ..     W      X
+  // base+1   E      F  ..     U      V
+  // base+1   G      H  ..     S      T      T      T      T      T
+
+  if (is_dr_mode) {
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0, need_above_left = 1;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1, need_above_left = 1;
+    else
+      need_above = 0, need_left = 1, need_above_left = 1;
+  }
+  if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+  assert(n_top_px >= 0);
+  assert(n_topright_px >= -1);
+  assert(n_left_px >= 0);
+  assert(n_bottomleft_px >= -1);
+
+  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+    int val;
+    if (need_left) {
+      val = (n_top_px > 0) ? above_ref[0] : base + 1;
+    } else {
+      val = (n_left_px > 0) ? left_ref[0] : base - 1;
+    }
+    for (i = 0; i < txhpx; ++i) {
+      aom_memset16(dst, val, txwpx);
+      dst += dst_stride;
+    }
+    return;
+  }
+
+  // NEED_LEFT
+  if (need_left) {
+    const int num_left_pixels_needed =
+        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
+    i = 0;
+    if (n_left_px > 0) {
+      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+      if (n_bottomleft_px > 0) {
+        assert(i == txhpx);
+        for (; i < txhpx + n_bottomleft_px; i++)
+          left_col[i] = left_ref[i * ref_stride];
+      }
+      if (i < num_left_pixels_needed)
+        aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+    } else if (n_top_px > 0) {
+      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
+    }
+  }
+
+  // NEED_ABOVE
+  if (need_above) {
+    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+      i = n_top_px;
+      if (n_topright_px > 0) {
+        assert(n_top_px == txwpx);
+        memcpy(above_row + txwpx, above_ref + txwpx,
+               n_topright_px * sizeof(above_ref[0]));
+        i += n_topright_px;
+      }
+      if (i < num_top_pixels_needed)
+        aom_memset16(&above_row[i], above_row[i - 1],
+                     num_top_pixels_needed - i);
+    } else if (n_left_px > 0) {
+      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
+    }
+  }
+
+  if (need_above_left) {
+    if (n_top_px > 0 && n_left_px > 0) {
+      above_row[-1] = above_ref[-1];
+    } else if (n_top_px > 0) {
+      above_row[-1] = above_ref[0];
+    } else if (n_left_px > 0) {
+      above_row[-1] = left_ref[0];
+    } else {
+      above_row[-1] = base;
+    }
+    left_col[-1] = above_row[-1];
+  }
+
+  if (use_filter_intra) {
+    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                                  filter_intra_mode, bit_depth);
+    return;
+  }
+
+  if (is_dr_mode) {
+    int upsample_above = 0;
+    int upsample_left = 0;
+    if (!disable_edge_filter) {
+      const int need_right = p_angle < 90;
+      const int need_bottom = p_angle > 180;
+      if (p_angle != 90 && p_angle != 180) {
+        const int ab_le = need_above_left ? 1 : 0;
+        if (need_above && need_left && (txwpx + txhpx >= 24)) {
+          highbd_filter_intra_edge_corner(above_row, left_col);
+        }
+        if (need_above && n_top_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
+          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+          av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
+        }
+        if (need_left && n_left_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
+          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+          av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
+        }
+      }
+      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                   intra_edge_filter_type);
+      if (need_above && upsample_above) {
+        const int n_px = txwpx + (need_right ? txhpx : 0);
+        av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+      }
+      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                  intra_edge_filter_type);
+      if (need_left && upsample_left) {
+        const int n_px = txhpx + (need_bottom ? txwpx : 0);
+        av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+      }
+    }
+    highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                        upsample_above, upsample_left, p_angle, bit_depth);
+    return;
+  }
+
+  // predict
+  if (mode == DC_PRED) {
+    dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+        dst, dst_stride, above_row, left_col, bit_depth);
+  } else {
+    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
                                             int subsampling_y) {
   assert(subsampling_x >= 0 && subsampling_x < 2);
@@ -1631,7 +1631,7 @@
   const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
-    build_intra_predictors_high(
+    highbd_build_intra_predictors(
         ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
         tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
         have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
@@ -1696,7 +1696,7 @@
     } else {
       cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
     }
-    cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
+    av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
     return;
   }
   av1_predict_intra_block(
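For reference, the [-1, 9, 9, -1]/16 kernel used by av1_highbd_upsample_intra_edge_c() above (and by its 8-bit counterpart) reproduces exact midpoints on a linear edge, which is the point of the half-sample interpolation. A standalone arithmetic check, not libaom code, with the clip to bit depth omitted:

    #include <assert.h>

    /* Half-pel sample from four consecutive edge samples:
     * s = (-a + 9*b + 9*c - d + 8) >> 4. */
    static int half_pel(int a, int b, int c, int d) {
      return (-a + 9 * b + 9 * c - d + 8) >> 4;
    }

    int main(void) {
      assert(half_pel(8, 10, 12, 14) == 11);  /* linear ramp: midpoint of 10 and 12 */
      assert(half_pel(8, 8, 10, 12) == 9);    /* replicated left sample (p[-1] case) */
      assert(half_pel(50, 50, 50, 50) == 50); /* flat edge stays flat */
      return 0;
    }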
diff --git a/av1/common/resize.c b/av1/common/resize.c
index f4bfcd0..f89f7ca 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -644,17 +644,20 @@
   }
 }
 
-void av1_resize_plane(const uint8_t *const input, int height, int width,
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride) {
   int i;
+  bool mem_status = true;
   uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
   uint8_t *tmpbuf =
       (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height));
   uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
   uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
-  if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+  if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) {
+    mem_status = false;
     goto Error;
+  }
   assert(width > 0);
   assert(height > 0);
   assert(width2 > 0);
@@ -673,16 +676,21 @@
   aom_free(tmpbuf);
   aom_free(arrbuf);
   aom_free(arrbuf2);
+  return mem_status;
 }
 
-void av1_upscale_plane_double_prec(const double *const input, int height,
+bool av1_upscale_plane_double_prec(const double *const input, int height,
                                    int width, int in_stride, double *output,
                                    int height2, int width2, int out_stride) {
   int i;
+  bool mem_status = true;
   double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height);
   double *arrbuf = (double *)aom_malloc(sizeof(double) * height);
   double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2);
-  if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error;
+  if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) {
+    mem_status = false;
+    goto Error;
+  }
   assert(width > 0);
   assert(height > 0);
   assert(width2 > 0);
@@ -700,6 +708,7 @@
   aom_free(intbuf);
   aom_free(arrbuf);
   aom_free(arrbuf2);
+  return mem_status;
 }
 
 static bool upscale_normative_rect(const uint8_t *const input, int height,
@@ -1128,35 +1137,49 @@
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth) {
-  av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
-                   owidth / 2, ouv_stride);
-  av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
-                   owidth / 2, ouv_stride);
+  if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                        oy_stride))
+    abort();
+  if (!av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+                        owidth / 2, ouv_stride))
+    abort();
+  if (!av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+                        owidth / 2, ouv_stride))
+    abort();
 }
 
-void av1_resize_frame422(const uint8_t *const y, int y_stride,
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth) {
-  av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
-                   ouv_stride);
-  av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
-                   ouv_stride);
+  if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                        oy_stride))
+    return false;
+  if (!av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
+                        owidth / 2, ouv_stride))
+    return false;
+  if (!av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
+                        owidth / 2, ouv_stride))
+    return false;
+  return true;
 }
 
-void av1_resize_frame444(const uint8_t *const y, int y_stride,
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth) {
-  av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
-                   ouv_stride);
-  av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
-                   ouv_stride);
+  if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                        oy_stride))
+    return false;
+  if (!av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+                        ouv_stride))
+    return false;
+  if (!av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+                        ouv_stride))
+    return false;
+  return true;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1251,7 +1274,7 @@
   aom_extend_frame_borders(dst, num_planes);
 }
 
-void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                               YV12_BUFFER_CONFIG *dst, int bd,
                                               const int num_planes) {
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
@@ -1261,25 +1284,29 @@
   for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
     const int is_uv = i > 0;
 #if CONFIG_AV1_HIGHBITDEPTH
-    if (src->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
       av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
                               src->crop_widths[is_uv], src->strides[is_uv],
                               dst->buffers[i], dst->crop_heights[is_uv],
                               dst->crop_widths[is_uv], dst->strides[is_uv], bd);
-    else
-      av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
-                       src->crop_widths[is_uv], src->strides[is_uv],
-                       dst->buffers[i], dst->crop_heights[is_uv],
-                       dst->crop_widths[is_uv], dst->strides[is_uv]);
+    } else if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                                 src->crop_widths[is_uv], src->strides[is_uv],
+                                 dst->buffers[i], dst->crop_heights[is_uv],
+                                 dst->crop_widths[is_uv],
+                                 dst->strides[is_uv])) {
+      return false;
+    }
 #else
     (void)bd;
-    av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
-                     src->crop_widths[is_uv], src->strides[is_uv],
-                     dst->buffers[i], dst->crop_heights[is_uv],
-                     dst->crop_widths[is_uv], dst->strides[is_uv]);
+    if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                          src->crop_widths[is_uv], src->strides[is_uv],
+                          dst->buffers[i], dst->crop_heights[is_uv],
+                          dst->crop_widths[is_uv], dst->strides[is_uv]))
+      return false;
 #endif
   }
   aom_extend_frame_borders(dst, num_planes);
+  return true;
 }
 
 void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
@@ -1410,15 +1437,19 @@
         cm->seq_params->bit_depth == AOM_BITS_8) {
       av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
     } else {
-      av1_resize_and_extend_frame_nonnormative(
-          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+      if (!av1_resize_and_extend_frame_nonnormative(
+              unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes))
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate buffers during resize");
     }
 #else
     if (use_optimized_scaler && has_optimized_scaler) {
       av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
     } else {
-      av1_resize_and_extend_frame_nonnormative(
-          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+      if (!av1_resize_and_extend_frame_nonnormative(
+              unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes))
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate buffers during resize");
     }
 #endif
     return scaled;
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 5927d8e..d1fab82 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -20,23 +20,25 @@
 extern "C" {
 #endif
 
-void av1_resize_plane(const uint8_t *const input, int height, int width,
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride);
-void av1_upscale_plane_double_prec(const double *const input, int height,
+bool av1_upscale_plane_double_prec(const double *const input, int height,
                                    int width, int in_stride, double *output,
                                    int height2, int width2, int out_stride);
+// TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from
+// av1/exports_com and delete this function.
 void av1_resize_frame420(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth);
-void av1_resize_frame422(const uint8_t *const y, int y_stride,
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth);
-void av1_resize_frame444(const uint8_t *const y, int y_stride,
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
                          int uv_stride, int height, int width, uint8_t *oy,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
@@ -77,7 +79,7 @@
     const bool for_psnr, const int border_in_pixels,
     const int num_pyramid_levels);
 
-void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                               YV12_BUFFER_CONFIG *dst, int bd,
                                               const int num_planes);
 
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 822f240..a26f329 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -20,6 +20,7 @@
 #include "av1/common/av1_common_int.h"
 #include "av1/common/resize.h"
 #include "av1/common/restoration.h"
+#include "av1/common/thread_common.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 
@@ -39,63 +40,43 @@
   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
 };
 
-PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
-  PixelRect rect;
-
+void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
+                                  int *plane_h) {
   int ss_x = is_uv && cm->seq_params->subsampling_x;
   int ss_y = is_uv && cm->seq_params->subsampling_y;
-
-  rect.top = 0;
-  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
-  rect.left = 0;
-  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
-  return rect;
+  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
 }
 
-// Count horizontal or vertical units per tile (use a width or height for
-// tile_size, respectively). We basically want to divide the tile size by the
+// Count horizontal or vertical units in a plane (use a width or height for
+// plane_size, respectively). We basically want to divide the plane size by the
 // size of a restoration unit. Rather than rounding up unconditionally as you
 // might expect, we round to nearest, which models the way a right or bottom
-// restoration unit can extend to up to 150% its normal width or height. The
-// max with 1 is to deal with tiles that are smaller than half of a restoration
-// unit.
-int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
-  return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
+// restoration unit can extend up to 150% of its normal width or height.
+//
+// The max with 1 is to deal with small frames, which may be smaller than
+// half of an LR unit in size.
+int av1_lr_count_units(int unit_size, int plane_size) {
+  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
 }
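
A quick worked example of the two helpers above (a standalone sketch; the macros are local copies of the libaom definitions so it compiles on its own):

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))
#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))

static int lr_count_units(int unit_size, int plane_size) {
  // Round to nearest: a right/bottom unit may stretch to 150% of unit_size.
  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
}

int main(void) {
  // Hypothetical 4:2:0 frame, superres-upscaled width 1920, height 1081.
  const int ss_x = 1, ss_y = 1;
  const int plane_w = ROUND_POWER_OF_TWO(1920, ss_x);  // 960
  const int plane_h = ROUND_POWER_OF_TWO(1081, ss_y);  // 541
  const int unit_size = 256;
  printf("%d x %d units\n", lr_count_units(unit_size, plane_w),
         lr_count_units(unit_size, plane_h));          // 4 x 2
  // A tiny plane still gets one unit thanks to the AOMMAX with 1.
  printf("%d\n", lr_count_units(unit_size, 100));      // 1
  return 0;
}
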
 
 void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                   int is_uv) {
-  // We need to allocate enough space for restoration units to cover the
-  // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
-  // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
-  // to do the computation ourselves, iterating over the tiles and keeping
-  // track of the largest width and height, then upscaling.
-  const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
-  const int max_tile_w = tile_rect.right - tile_rect.left;
-  const int max_tile_h = tile_rect.bottom - tile_rect.top;
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 
-  // To calculate hpertile and vpertile (horizontal and vertical units per
-  // tile), we basically want to divide the largest tile width or height by the
-  // size of a restoration unit. Rather than rounding up unconditionally as you
-  // might expect, we round to nearest, which models the way a right or bottom
-  // restoration unit can extend to up to 150% its normal width or height. The
-  // max with 1 is to deal with tiles that are smaller than half of a
-  // restoration unit.
   const int unit_size = rsi->restoration_unit_size;
-  const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
-  const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
+  const int horz_units = av1_lr_count_units(unit_size, plane_w);
+  const int vert_units = av1_lr_count_units(unit_size, plane_h);
 
-  rsi->units_per_tile = hpertile * vpertile;
-  rsi->horz_units_per_tile = hpertile;
-  rsi->vert_units_per_tile = vpertile;
-
-  const int ntiles = 1;
-  const int nunits = ntiles * rsi->units_per_tile;
+  rsi->num_rest_units = horz_units * vert_units;
+  rsi->horz_units = horz_units;
+  rsi->vert_units = vert_units;
 
   aom_free(rsi->unit_info);
   CHECK_MEM_ERROR(cm, rsi->unit_info,
                   (RestorationUnitInfo *)aom_memalign(
-                      16, sizeof(*rsi->unit_info) * nunits));
+                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
 }
 
 void av1_free_restoration_struct(RestorationInfo *rst_info) {
@@ -174,8 +155,9 @@
   }
 }
 
-static void copy_tile_highbd(int width, int height, const uint16_t *src,
-                             int src_stride, uint16_t *dst, int dst_stride) {
+static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
+                                  int src_stride, uint16_t *dst,
+                                  int dst_stride) {
   for (int i = 0; i < height; ++i)
     memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
 }
@@ -194,23 +176,24 @@
   extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
 }
 
-static void copy_tile_lowbd(int width, int height, const uint8_t *src,
-                            int src_stride, uint8_t *dst, int dst_stride) {
+static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride) {
   for (int i = 0; i < height; ++i)
     memcpy(dst + i * dst_stride, src + i * src_stride, width);
 }
 
-static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride, int highbd) {
+static void copy_rest_unit(int width, int height, const uint8_t *src,
+                           int src_stride, uint8_t *dst, int dst_stride,
+                           int highbd) {
 #if CONFIG_AV1_HIGHBITDEPTH
   if (highbd) {
-    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
-                     CONVERT_TO_SHORTPTR(dst), dst_stride);
+    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+                          CONVERT_TO_SHORTPTR(dst), dst_stride);
     return;
   }
 #endif
   (void)highbd;
-  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
 }
 
 #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
@@ -218,53 +201,34 @@
 // With striped loop restoration, the filtering for each 64-pixel stripe gets
 // most of its input from the output of CDEF (stored in data8), but we need to
 // fill out a border of 3 pixels above/below the stripe according to the
-// following
-// rules:
+// following rules:
 //
-// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
-//   This extension is done by a call to av1_extend_frame() at the start of the
-//   loop restoration process, so the value of copy_above/copy_below doesn't
-//   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
-//   loop filtering across tiles is disabled, we can allow
-//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
-//   data has always been copied, simplifying the behaviour at the left and
-//   right edges of tiles.
+// * At the top and bottom of the frame, we copy the outermost row of CDEF
+//   pixels three times. This extension is done by a call to av1_extend_frame()
+//   at the start of the loop restoration process, so the value of
+//   copy_above/copy_below doesn't strictly matter.
 //
-// * If we're at a tile boundary and loop filtering across tiles is enabled,
-//   then there is a logical stripe which is 64 pixels high, but which is split
-//   into an 8px high and a 56px high stripe so that the processing (and
-//   coefficient set usage) can be aligned to tiles.
-//   In this case, we use the 3 rows of CDEF output across the boundary for
-//   context; this corresponds to leaving the frame buffer as-is.
-//
-// * If we're at a tile boundary and loop filtering across tiles is disabled,
-//   then we take the outermost row of CDEF pixels *within the current tile*
-//   and copy it three times. Thus we behave exactly as if the tile were a full
-//   frame.
-//
-// * Otherwise, we're at a stripe boundary within a tile. In that case, we
-//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
-//
-// The distinction between the latter two cases is handled by the
-// av1_loop_restoration_save_boundary_lines() function, so here we just need
-// to decide if we're overwriting the above/below boundary pixels or not.
+// * All other boundaries are stripe boundaries within the frame. In that case,
+//   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
-                                     const PixelRect *tile_rect, int ss_y,
+                                     int plane_w, int plane_h, int ss_y,
                                      int *copy_above, int *copy_below) {
+  (void)plane_w;
+
   *copy_above = 1;
   *copy_below = 1;
 
   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
 
-  const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
+  const int first_stripe_in_plane = (limits->v_start == 0);
   const int this_stripe_height =
-      full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
-  const int last_stripe_in_tile =
-      (limits->v_start + this_stripe_height >= tile_rect->bottom);
+      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
+  const int last_stripe_in_plane =
+      (limits->v_start + this_stripe_height >= plane_h);
 
-  if (first_stripe_in_tile) *copy_above = 0;
-  if (last_stripe_in_tile) *copy_below = 0;
+  if (first_stripe_in_plane) *copy_above = 0;
+  if (last_stripe_in_plane) *copy_below = 0;
 }
 
 // Overwrite the border pixels around a processing stripe so that the conditions
@@ -276,10 +240,6 @@
 // limits gives the rectangular limits of the remaining stripes for the current
 // restoration unit. rsb is the stored stripe boundaries (taken from either
 // deblock or CDEF output as necessary).
-//
-// tile_rect is the limits of the current tile and tile_stripe0 is the index of
-// the first stripe in this tile (needed to convert the tile-relative stripe
-// index we get from limits into something we can look up in rsb).
 static void setup_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
     int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
@@ -300,12 +260,6 @@
   // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
   // duplicating the topmost of the 2 lines (see the AOMMAX call when
   // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
-  //
-  // Special case: If we're at the top of a tile, which isn't on the topmost
-  // tile row, and we're allowed to loop filter across tiles, then we have a
-  // logical 64-pixel-high stripe which has been split into an 8-pixel high
-  // stripe and a 56-pixel high stripe (the current one). So, in this case,
-  // we want to leave the boundary alone!
   if (!opt) {
     if (copy_above) {
       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
@@ -370,19 +324,9 @@
   }
 }
 
-// This function restores the boundary lines modified by
-// setup_processing_stripe_boundary.
-//
-// Note: We need to be careful when handling the corners of the processing
-// unit, because (eg.) the top-left corner is considered to be part of
-// both the left and top borders. This means that, depending on the
-// loop_filter_across_tiles_enabled flag, the corner pixels might get
-// overwritten twice, once as part of the "top" border and once as part
-// of the "left" border (or similar for other corners).
-//
-// Everything works out fine as long as we make sure to reverse the order
-// when restoring, ie. we need to restore the left/right borders followed
-// by the top/bottom borders.
+// Once a processing stripe is finished, this function sets the boundary
+// pixels which were overwritten by setup_processing_stripe_boundary()
+// back to their original values.
 static void restore_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
@@ -440,11 +384,13 @@
                                  int stripe_width, int stripe_height,
                                  int procunit_width, const uint8_t *src,
                                  int src_stride, uint8_t *dst, int dst_stride,
-                                 int32_t *tmpbuf, int bit_depth) {
+                                 int32_t *tmpbuf, int bit_depth,
+                                 struct aom_internal_error_info *error_info) {
   (void)tmpbuf;
   (void)bit_depth;
+  (void)error_info;
   assert(bit_depth == 8);
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
@@ -908,19 +854,18 @@
   return 0;
 }
 
-void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
-                                        int height, int stride, int eps,
-                                        const int *xqd, uint8_t *dst8,
-                                        int dst_stride, int32_t *tmpbuf,
-                                        int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
+                                       int height, int stride, int eps,
+                                       const int *xqd, uint8_t *dst8,
+                                       int dst_stride, int32_t *tmpbuf,
+                                       int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
 
   const int ret = av1_selfguided_restoration_c(
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
-  (void)ret;
-  assert(!ret);
+  if (ret != 0) return ret;
   const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
   av1_decode_xq(xqd, xq, params);
@@ -947,33 +892,40 @@
         *dst8ij = (uint8_t)out;
     }
   }
+  return 0;
 }
 
 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
                                   int stripe_width, int stripe_height,
                                   int procunit_width, const uint8_t *src,
                                   int src_stride, uint8_t *dst, int dst_stride,
-                                  int32_t *tmpbuf, int bit_depth) {
+                                  int32_t *tmpbuf, int bit_depth,
+                                  struct aom_internal_error_info *error_info) {
   (void)bit_depth;
   assert(bit_depth == 8);
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-    av1_apply_selfguided_restoration(
-        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
-        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
+    if (av1_apply_selfguided_restoration(
+            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
+            0) != 0) {
+      aom_internal_error(
+          error_info, AOM_CODEC_MEM_ERROR,
+          "Error allocating buffer in av1_apply_selfguided_restoration");
+    }
   }
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
-                                        int stripe_width, int stripe_height,
-                                        int procunit_width, const uint8_t *src8,
-                                        int src_stride, uint8_t *dst8,
-                                        int dst_stride, int32_t *tmpbuf,
-                                        int bit_depth) {
+static void wiener_filter_stripe_highbd(
+    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int bit_depth,
+    struct aom_internal_error_info *error_info) {
   (void)tmpbuf;
-  const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+  (void)error_info;
+  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
@@ -986,17 +938,21 @@
   }
 }
 
-static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
-                                         int stripe_width, int stripe_height,
-                                         int procunit_width,
-                                         const uint8_t *src8, int src_stride,
-                                         uint8_t *dst8, int dst_stride,
-                                         int32_t *tmpbuf, int bit_depth) {
+static void sgrproj_filter_stripe_highbd(
+    const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+    int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int bit_depth,
+    struct aom_internal_error_info *error_info) {
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-    av1_apply_selfguided_restoration(
-        src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
-        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+    if (av1_apply_selfguided_restoration(
+            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
+            1) != 0) {
+      aom_internal_error(
+          error_info, AOM_CODEC_MEM_ERROR,
+          "Error allocating buffer in av1_apply_selfguided_restoration");
+    }
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
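
The stripe filters above no longer assert when av1_apply_selfguided_restoration() fails; they report the failure through aom_internal_error(), which in libaom records the error and, once a setjmp target has been installed, longjmps back to the codec's error handler. A stripped-down standalone analogue of that mechanism, with hypothetical names rather than the real aom_internal_error_info type:

#include <setjmp.h>
#include <stdio.h>

/* Hypothetical stand-in for aom_internal_error_info. */
typedef struct {
  jmp_buf jmp;
  int has_jmp;
  const char *detail;
} err_info;

/* Analogous to aom_internal_error(): record the message, then unwind. */
static void report_error(err_info *e, const char *detail) {
  e->detail = detail;
  if (e->has_jmp) longjmp(e->jmp, 1);
}

static void filter_stripe(err_info *e, int fail) {
  if (fail) report_error(e, "Error allocating buffer in selfguided filter");
  /* ... normal filtering would continue here ... */
}

int main(void) {
  static err_info e;
  if (setjmp(e.jmp)) {   // Error path: control returns here on longjmp.
    printf("filtering failed: %s\n", e.detail ? e.detail : "(unknown)");
    return 1;
  }
  e.has_jmp = 1;
  filter_stripe(&e, 1);  // Simulate an allocation failure.
  return 0;
}
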
@@ -1005,7 +961,8 @@
                                   int stripe_width, int stripe_height,
                                   int procunit_width, const uint8_t *src,
                                   int src_stride, uint8_t *dst, int dst_stride,
-                                  int32_t *tmpbuf, int bit_depth);
+                                  int32_t *tmpbuf, int bit_depth,
+                                  struct aom_internal_error_info *error_info);
 
 #if CONFIG_AV1_HIGHBITDEPTH
 #define NUM_STRIPE_FILTERS 4
@@ -1024,9 +981,9 @@
 void av1_loop_restoration_filter_unit(
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
-    const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
-    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+    int optimized_lr, struct aom_internal_error_info *error_info) {
   RestorationType unit_rtype = rui->restoration_type;
 
   int unit_h = limits->v_end - limits->v_start;
@@ -1035,7 +992,8 @@
   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
 
   if (unit_rtype == RESTORE_NONE) {
-    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
+                   highbd);
     return;
   }
 
@@ -1045,32 +1003,30 @@
 
   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
 
-  // Convolve the whole tile one stripe at a time
+  // Filter the current restoration unit one stripe at a time
   RestorationTileLimits remaining_stripes = *limits;
   int i = 0;
   while (i < unit_h) {
     int copy_above, copy_below;
     remaining_stripes.v_start = limits->v_start + i;
 
-    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
-                             &copy_below);
+    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
+                             &copy_above, &copy_below);
 
     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
 
     // Work out where this stripe's boundaries are within
     // rsb->stripe_boundary_{above,below}
-    const int tile_stripe =
-        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
-        full_stripe_height;
-    const int frame_stripe = tile_stripe0 + tile_stripe;
+    const int frame_stripe =
+        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
 
     // Calculate this stripe's height, based on two rules:
-    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
     // * We can't extend past the end of the current restoration unit
     const int nominal_stripe_height =
-        full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
+        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
     const int h = AOMMIN(nominal_stripe_height,
                          remaining_stripes.v_end - remaining_stripes.v_start);
 
@@ -1079,7 +1035,8 @@
                                      copy_below, optimized_lr);
 
     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
-                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
+                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
+                  error_info);
 
     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
                                        data8, stride, copy_above, copy_below,
@@ -1090,17 +1047,17 @@
 }
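
The frame_stripe and stripe-height arithmetic above is easier to see with numbers. A small standalone sketch for a luma plane (ss_y = 0, so full_stripe_height = 64 and runit_offset = 8, per the constants in restoration.h), with a hypothetical plane height:

#include <stdio.h>

#define RESTORATION_PROC_UNIT_SIZE 64
#define RESTORATION_UNIT_OFFSET 8

int main(void) {
  const int ss_y = 0;  // luma plane
  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;  // 64
  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;           // 8
  const int plane_h = 200;  // hypothetical plane height

  int v_start = 0;
  while (v_start < plane_h) {
    const int frame_stripe = (v_start + runit_offset) / full_stripe_height;
    // Only the very first stripe in the frame is shortened by the offset.
    const int h = full_stripe_height - (frame_stripe == 0 ? runit_offset : 0);
    const int v_end = (v_start + h < plane_h) ? v_start + h : plane_h;
    printf("stripe %d: rows [%d, %d)\n", frame_stripe, v_start, v_end);
    v_start += h;
  }
  // Prints stripes covering [0,56), [56,120), [120,184), [184,200).
  return 0;
}
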
 
 static void filter_frame_on_unit(const RestorationTileLimits *limits,
-                                 const PixelRect *tile_rect, int rest_unit_idx,
-                                 void *priv, int32_t *tmpbuf,
-                                 RestorationLineBuffers *rlbs) {
+                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
+                                 RestorationLineBuffers *rlbs,
+                                 struct aom_internal_error_info *error_info) {
   FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
   const RestorationInfo *rsi = ctxt->rsi;
 
   av1_loop_restoration_filter_unit(
-      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
-      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
-      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
-      rsi->optimized_lr);
+      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
+      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
+      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
+      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
 }
 
 void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
@@ -1127,31 +1084,33 @@
     RestorationInfo *rsi = &cm->rst_info[plane];
     RestorationType rtype = rsi->frame_restoration_type;
     rsi->optimized_lr = optimized_lr;
+    lr_ctxt->ctxt[plane].rsi = rsi;
 
     if (rtype == RESTORE_NONE) {
       continue;
     }
 
     const int is_uv = plane > 0;
-    const int plane_width = frame->crop_widths[is_uv];
-    const int plane_height = frame->crop_heights[is_uv];
-    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+    int plane_w, plane_h;
+    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+    assert(plane_w == frame->crop_widths[is_uv]);
+    assert(plane_h == frame->crop_heights[is_uv]);
 
-    av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
+    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
                      frame->strides[is_uv], RESTORATION_BORDER,
                      RESTORATION_BORDER, highbd);
 
-    lr_plane_ctxt->rsi = rsi;
+    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
     lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
     lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
+    lr_plane_ctxt->plane_w = plane_w;
+    lr_plane_ctxt->plane_h = plane_h;
     lr_plane_ctxt->highbd = highbd;
     lr_plane_ctxt->bit_depth = bit_depth;
     lr_plane_ctxt->data8 = frame->buffers[plane];
     lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
     lr_plane_ctxt->data_stride = frame->strides[is_uv];
     lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
-    lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
-    lr_plane_ctxt->tile_stripe0 = 0;
   }
 }
 
@@ -1166,9 +1125,9 @@
   assert(num_planes <= 3);
   for (int plane = 0; plane < num_planes; ++plane) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
-    PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
-    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
-                     tile_rect.right, tile_rect.top, tile_rect.bottom);
+    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
+    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
+                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
   }
 }
 
@@ -1182,8 +1141,7 @@
     }
 
     av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
-                                   &ctxt[plane], &ctxt[plane].tile_rect,
-                                   cm->rst_tmpbuf, cm->rlbs);
+                                   &ctxt[plane], cm->rst_tmpbuf, cm->rlbs);
   }
 }
 
@@ -1204,24 +1162,23 @@
 }
 
 void av1_foreach_rest_unit_in_row(
-    RestorationTileLimits *limits, const PixelRect *tile_rect,
+    RestorationTileLimits *limits, int plane_w,
     rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
-    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
-    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
-    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
-    struct AV1LrSyncData *const lr_sync) {
-  const int tile_w = tile_rect->right - tile_rect->left;
+    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+    struct aom_internal_error_info *error_info) {
   const int ext_size = unit_size * 3 / 2;
   int x0 = 0, j = 0;
-  while (x0 < tile_w) {
-    int remaining_w = tile_w - x0;
+  while (x0 < plane_w) {
+    int remaining_w = plane_w - x0;
     int w = (remaining_w < ext_size) ? remaining_w : unit_size;
 
-    limits->h_start = tile_rect->left + x0;
-    limits->h_end = tile_rect->left + x0 + w;
-    assert(limits->h_end <= tile_rect->right);
+    limits->h_start = x0;
+    limits->h_end = x0 + w;
+    assert(limits->h_end <= plane_w);
 
-    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+    const int unit_idx = row_number * hnum_rest_units + j;
 
     // No sync for even numbered rows
     // For odd numbered rows, Loop Restoration of current block requires the LR
@@ -1229,13 +1186,23 @@
 
     // top-right sync
     on_sync_read(lr_sync, row_number, j, plane);
-    if ((row_number + 1) < vunits_per_tile)
+    if ((row_number + 1) < vnum_rest_units)
       // bottom-right sync
       on_sync_read(lr_sync, row_number + 2, j, plane);
 
-    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+#if CONFIG_MULTITHREAD
+    if (lr_sync && lr_sync->num_workers > 1) {
+      pthread_mutex_lock(lr_sync->job_mutex);
+      const bool lr_mt_exit = lr_sync->lr_mt_exit;
+      pthread_mutex_unlock(lr_sync->job_mutex);
+      // Exit in case any worker has encountered an error.
+      if (lr_mt_exit) return;
+    }
+#endif
 
-    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
+
+    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
 
     x0 += w;
     ++j;
@@ -1258,57 +1225,45 @@
   (void)plane;
 }
 
-static void foreach_rest_unit_in_tile(
-    const PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
-    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
-    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
-    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
-  const int tile_h = tile_rect->bottom - tile_rect->top;
-  const int ext_size = unit_size * 3 / 2;
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs) {
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  const int hnum_rest_units = rsi->horz_units;
+  const int vnum_rest_units = rsi->vert_units;
+  const int unit_size = rsi->restoration_unit_size;
 
-  const int tile_idx = tile_col + tile_row * tile_cols;
-  const int unit_idx0 = tile_idx * units_per_tile;
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
+  const int ext_size = unit_size * 3 / 2;
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 
   int y0 = 0, i = 0;
-  while (y0 < tile_h) {
-    int remaining_h = tile_h - y0;
+  while (y0 < plane_h) {
+    int remaining_h = plane_h - y0;
     int h = (remaining_h < ext_size) ? remaining_h : unit_size;
 
     RestorationTileLimits limits;
-    limits.v_start = tile_rect->top + y0;
-    limits.v_end = tile_rect->top + y0 + h;
-    assert(limits.v_end <= tile_rect->bottom);
-    // Offset the tile upwards to align with the restoration processing stripe
+    limits.v_start = y0;
+    limits.v_end = y0 + h;
+    assert(limits.v_end <= plane_h);
+    // Offset upwards to align with the restoration processing stripe
     const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
-    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
-    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+    limits.v_start = AOMMAX(0, limits.v_start - voffset);
+    if (limits.v_end < plane_h) limits.v_end -= voffset;
 
-    av1_foreach_rest_unit_in_row(
-        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
-        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
-        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
+                                 hnum_rest_units, vnum_rest_units, plane, priv,
+                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
+                                 av1_lr_sync_write_dummy, NULL, cm->error);
 
     y0 += h;
     ++i;
   }
 }
 
-void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
-                                    rest_unit_visitor_t on_rest_unit,
-                                    void *priv, PixelRect *tile_rect,
-                                    int32_t *tmpbuf,
-                                    RestorationLineBuffers *rlbs) {
-  const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->seq_params->subsampling_y;
-
-  const RestorationInfo *rsi = &cm->rst_info[plane];
-
-  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
-                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
-                            rsi->units_per_tile, rsi->restoration_unit_size,
-                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
-}
-
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
@@ -1316,33 +1271,21 @@
   assert(rcol0 && rcol1 && rrow0 && rrow1);
 
   if (bsize != cm->seq_params->sb_size) return 0;
-  if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
 
   assert(!cm->features.all_lossless);
 
   const int is_uv = plane > 0;
 
-  const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
-  const int tile_w = tile_rect.right - tile_rect.left;
-  const int tile_h = tile_rect.bottom - tile_rect.top;
-
-  const int mi_top = 0;
-  const int mi_left = 0;
-
-  // Compute the mi-unit corners of the superblock relative to the top-left of
-  // the tile
-  const int mi_rel_row0 = mi_row - mi_top;
-  const int mi_rel_col0 = mi_col - mi_left;
-  const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
-  const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+  // Compute the mi-unit corners of the superblock
+  const int mi_row0 = mi_row;
+  const int mi_col0 = mi_col;
+  const int mi_row1 = mi_row0 + mi_size_high[bsize];
+  const int mi_col1 = mi_col0 + mi_size_wide[bsize];
 
   const RestorationInfo *rsi = &cm->rst_info[plane];
   const int size = rsi->restoration_unit_size;
-
-  // Calculate the number of restoration units in this tile (which might be
-  // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
-  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
-  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+  const int horz_units = rsi->horz_units;
+  const int vert_units = rsi->vert_units;
 
   // The size of an MI-unit on this plane of the image
   const int ss_x = is_uv && cm->seq_params->subsampling_x;
@@ -1367,19 +1310,18 @@
   const int rnd_x = denom_x - 1;
   const int rnd_y = denom_y - 1;
 
-  // rcol0/rrow0 should be the first column/row of restoration units (relative
-  // to the top-left of the tile) that doesn't start left/below of
-  // mi_col/mi_row. For this calculation, we need to round up the division (if
-  // the sb starts at runit column 10.1, the first matching runit has column
-  // index 11)
-  *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
-  *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+  // rcol0/rrow0 should be the first column/row of restoration units that
+  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
+  // to round up the division (if the sb starts at runit column 10.1, the first
+  // matching runit has column index 11)
+  *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
+  *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
 
   // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
-  // below-right. If we're at the bottom or right of the tile, this restoration
+  // below-right. If we're at the bottom or right of the frame, this restoration
   // unit might not exist, in which case we'll clamp accordingly.
-  *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
-  *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+  *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+  *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
 
   return *rcol0 < *rcol1 && *rrow0 < *rrow1;
 }
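
The round-up divisions above decide which superblock signals each restoration unit's parameters. A worked sketch, assuming no superres scaling and 128x128 superblocks, so mi_to_num_x is MI_SIZE (4) and denom_x is simply the unit size (the full function also folds in the superres denominator); the plane width and unit count are the hypothetical values from the earlier example:

#include <stdio.h>

#define MI_SIZE 4
#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  // Hypothetical luma plane, no superres: 960 px wide, 256-px units -> 4 cols.
  const int unit_size = 256, horz_units = 4;
  const int mi_to_num_x = MI_SIZE;  // 4 px per mi unit
  const int denom_x = unit_size;
  const int rnd_x = denom_x - 1;
  const int sb_mi_cols = 32;        // 128-px superblocks

  for (int mi_col = 0; mi_col * MI_SIZE < 960; mi_col += sb_mi_cols) {
    const int mi_col1 = mi_col + sb_mi_cols;
    const int rcol0 = (mi_col * mi_to_num_x + rnd_x) / denom_x;  // round up
    const int rcol1 =
        AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
    printf("SB at x=%4d signals unit columns [%d, %d)\n", mi_col * MI_SIZE,
           rcol0, rcol1);
  }
  // Each of the 4 unit columns is signaled in exactly one superblock:
  // x=0 -> [0,1), x=128 -> [1,1), x=256 -> [1,2), x=384 -> [2,2),
  // x=512 -> [2,3), x=640 -> [3,3), x=768 -> [3,4), x=896 -> [4,4).
  return 0;
}
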
@@ -1480,73 +1422,59 @@
                                  : src_width;
   const int line_bytes = upscaled_width << use_highbd;
   for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
-    // Copy the line at 'row' into both context lines. This is because
-    // we want to (effectively) extend the outermost row of CDEF data
-    // from this tile to produce a border, rather than using deblocked
-    // pixels from the tile above/below.
+    // Copy the line at 'src_rows' into both context lines
     memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
   }
   extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
                RESTORATION_EXTRA_HORZ, use_highbd);
 }
 
-static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
-                                         int use_highbd, int plane,
-                                         AV1_COMMON *cm, int after_cdef) {
+static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
+                                int plane, AV1_COMMON *cm, int after_cdef) {
   const int is_uv = plane > 0;
   const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
 
-  // Get the tile rectangle, with height rounded up to the next multiple of 8
-  // luma pixels (only relevant for the bottom tile of the frame)
-  const PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
-  const int stripe0 = 0;
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 
   RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
 
   const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
 
-  int tile_stripe;
-  for (tile_stripe = 0;; ++tile_stripe) {
-    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
-    const int y0 = tile_rect.top + rel_y0;
-    if (y0 >= tile_rect.bottom) break;
+  int stripe_idx;
+  for (stripe_idx = 0;; ++stripe_idx) {
+    const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
+    const int y0 = rel_y0;
+    if (y0 >= plane_h) break;
 
-    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
-    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+    const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
+    const int y1 = AOMMIN(rel_y1, plane_h);
 
-    const int frame_stripe = stripe0 + tile_stripe;
-
-    // In this case, we should only use CDEF pixels at the top
-    // and bottom of the frame as a whole; internal tile boundaries
-    // can use deblocked pixels from adjacent tiles for context.
-    const int use_deblock_above = (frame_stripe > 0);
+    // Extend using CDEF pixels at the top and bottom of the frame,
+    // and deblocked pixels at internal stripe boundaries
+    const int use_deblock_above = (stripe_idx > 0);
     const int use_deblock_below = (y1 < plane_height);
 
     if (!after_cdef) {
-      // Save deblocked context where needed.
+      // Save deblocked context at internal stripe boundaries
       if (use_deblock_above) {
         save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
-                                    frame_stripe, use_highbd, 1, boundaries);
+                                    stripe_idx, use_highbd, 1, boundaries);
       }
       if (use_deblock_below) {
-        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                     use_highbd, 0, boundaries);
       }
     } else {
-      // Save CDEF context where needed. Note that we need to save the CDEF
-      // context for a particular boundary iff we *didn't* save deblocked
-      // context for that boundary.
-      //
-      // In addition, we need to save copies of the outermost line within
-      // the tile, rather than using data from outside the tile.
+      // Save CDEF context at frame boundaries
       if (!use_deblock_above) {
-        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                  1, boundaries);
       }
       if (!use_deblock_below) {
-        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                  use_highbd, 0, boundaries);
       }
     }
@@ -1561,6 +1489,6 @@
   const int num_planes = av1_num_planes(cm);
   const int use_highbd = cm->seq_params->use_highbitdepth;
   for (int p = 0; p < num_planes; ++p) {
-    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   }
 }
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index bf21303..d5da81d 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -33,7 +33,7 @@
 
 #define RESTORATION_PROC_UNIT_SIZE 64
 
-// Filter tile grid offset upwards compared to the superblock grid
+// Filter stripe grid offset upwards compared to the superblock grid
 #define RESTORATION_UNIT_OFFSET 8
 
 #define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
@@ -180,10 +180,6 @@
 #error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
 #endif
 
-#define LR_TILE_ROW 0
-#define LR_TILE_COL 0
-#define LR_TILE_COLS 1
-
 typedef struct {
   int r[2];  // radii
   int s[2];  // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
@@ -215,12 +211,6 @@
 #define RESTORATION_LINEBUFFER_WIDTH \
   (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
 
-// Similarly, the column buffers (used when we're at a vertical tile edge
-// that we can't filter across) need space for one processing unit's worth
-// of pixels, plus the top/bottom border width
-#define RESTORATION_COLBUFFER_HEIGHT \
-  (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER)
-
 typedef struct {
   // Temporary buffers to save/restore 3 lines above/below the restoration
   // stripe.
@@ -266,32 +256,26 @@
 
   /**
    * \name Fields allocated and initialised by av1_alloc_restoration_struct.
-   * (horz_)units_per_tile give the number of restoration units in
-   * (one row of) the largest tile in the frame.
    */
   /**@{*/
   /*!
-   * Number of units per tile for the largest tile in the frame
+   * Total number of restoration units in this plane
    */
-  int units_per_tile;
+  int num_rest_units;
 
   /*!
-   * Number of vertical units per tile
+   * Number of vertical restoration units in this plane
    */
-  int vert_units_per_tile;
+  int vert_units;
 
   /*!
-   * Number of horizontal units per tile for the largest tile in the frame
+   * Number of horizontal restoration units in this plane
    */
-  int horz_units_per_tile;
+  int horz_units;
   /**@}*/
 
   /*!
-   * List of info for units in tile.
-   * The data in unit_info is laid out with units_per_tile entries for each
-   * tile, which have stride horz_units_per_tile.
-   * Even if there are tiles of different sizes, the data in unit_info is
-   * laid out as if all tiles are of full size.
+   * Parameters for each restoration unit in this plane
    */
   RestorationUnitInfo *unit_info;
 
@@ -332,19 +316,18 @@
 } RestorationTileLimits;
 
 typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
-                                    const PixelRect *tile_rect,
                                     int rest_unit_idx, void *priv,
                                     int32_t *tmpbuf,
-                                    RestorationLineBuffers *rlbs);
+                                    RestorationLineBuffers *rlbs,
+                                    struct aom_internal_error_info *error_info);
 
 typedef struct FilterFrameCtxt {
   const RestorationInfo *rsi;
-  int tile_stripe0;
   int ss_x, ss_y;
+  int plane_w, plane_h;
   int highbd, bit_depth;
   uint8_t *data8, *dst8;
   int data_stride, dst_stride;
-  PixelRect tile_rect;
 } FilterFrameCtxt;
 
 typedef struct AV1LrStruct {
@@ -375,27 +358,29 @@
  * This function applies the loop restoration filter to a single
  * loop restoration unit.
  *
- * \param[in]  limits        Limits of the unit
- * \param[in]  rui           The parameters to use for this unit and its
- *                           coefficients
- * \param[in]  rsb           Deblocked pixels to use for stripe boundaries
- * \param[in]  rlbs          Space to use as a scratch buffer
- * \param[in]  tile_rect     Limits of the tile containing this unit
- * \param[in]  tile_stripe0  Index of the first stripe in this tile
- * \param[in]  ss_x          Horizontal subsampling for plane
- * \param[in]  ss_y          Vertical subsampling for plane
- * \param[in]  highbd        Whether high bitdepth pipeline is used
- * \param[in]  bit_depth     Bit-depth of the video
- * \param[in]  data8         Frame data (pointing at the top-left corner of
- *                           the frame, not the restoration unit).
- * \param[in]  stride        Stride of \c data8
- * \param[out] dst8          Buffer where the results will be written. Like
- *                           \c data8, \c dst8 should point at the top-left
- *                           corner of the frame
- * \param[in]  dst_stride    Stride of \c dst8
- * \param[in]  tmpbuf        Scratch buffer used by the sgrproj filter which
- *                           should be at least SGRPROJ_TMPBUF_SIZE big.
- * \param[in]  optimized_lr  Whether to use fast optimized Loop Restoration
+ * \param[in]       limits        Limits of the unit
+ * \param[in]       rui           The parameters to use for this unit and its
+ *                                coefficients
+ * \param[in]       rsb           Deblocked pixels to use for stripe boundaries
+ * \param[in]       rlbs          Space to use as a scratch buffer
+ * \param[in]       plane_w       Width of the current plane
+ * \param[in]       plane_h       Height of the current plane
+ * \param[in]       ss_x          Horizontal subsampling for plane
+ * \param[in]       ss_y          Vertical subsampling for plane
+ * \param[in]       highbd        Whether high bitdepth pipeline is used
+ * \param[in]       bit_depth     Bit-depth of the video
+ * \param[in]       data8         Frame data (pointing at the top-left corner of
+ *                                the frame, not the restoration unit).
+ * \param[in]       stride        Stride of \c data8
+ * \param[out]      dst8          Buffer where the results will be written. Like
+ *                                \c data8, \c dst8 should point at the top-left
+ *                                corner of the frame
+ * \param[in]       dst_stride    Stride of \c dst8
+ * \param[in]       tmpbuf        Scratch buffer used by the sgrproj filter
+ *                                which should be at least SGRPROJ_TMPBUF_SIZE
+ *                                big.
+ * \param[in]       optimized_lr  Whether to use fast optimized Loop Restoration
+ * \param[in,out]   error_info    Error info for reporting errors
  *
  * \remark Nothing is returned. Instead, the filtered unit is output in
  * \c dst8 at the proper restoration unit offset.
@@ -403,17 +388,17 @@
 void av1_loop_restoration_filter_unit(
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
-    const PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
-    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf, int optimized_lr);
+    int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+    uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+    int optimized_lr, struct aom_internal_error_info *error_info);
 
 /*!\brief Function for applying loop restoration filter to a frame
  *
  * \ingroup in_loop_restoration
  * This function applies the loop restoration filter to a frame.
  *
- * \param[in, out]  frame         Compressed frame buffer
- * \param[in, out]  cm            Pointer to top level common structure
+ * \param[in,out]   frame         Compressed frame buffer
+ * \param[in,out]   cm            Pointer to top level common structure
  * \param[in]       optimized_lr  Whether to use fast optimized Loop Restoration
  * \param[in]       lr_ctxt       Loop restoration context
  *
@@ -427,8 +412,6 @@
 
 void av1_loop_restoration_precal();
 
-typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
-                                          void *priv);
 struct AV1LrSyncData;
 
 typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
@@ -439,8 +422,7 @@
 // Call on_rest_unit for each loop restoration unit in the plane.
 void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                     rest_unit_visitor_t on_rest_unit,
-                                    void *priv, PixelRect *tile_rect,
-                                    int32_t *tmpbuf,
+                                    void *priv, int32_t *tmpbuf,
                                     RestorationLineBuffers *rlbs);
 
 // Return 1 iff the block at mi_row, mi_col with size bsize is a
@@ -448,10 +430,9 @@
 // loop restoration unit.
 //
 // If the block is a top-level superblock, the function writes to
-// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
-// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) are relative
-// to the current tile, whose starting index is returned as
-// *tile_tl_idx.
+// *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all
+// restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1)
+// are signaled in this superblock.
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
@@ -467,14 +448,16 @@
 void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                       struct AV1Common *cm, int num_planes);
 void av1_foreach_rest_unit_in_row(
-    RestorationTileLimits *limits, const PixelRect *tile_rect,
+    RestorationTileLimits *limits, int plane_w,
     rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
-    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
-    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
-    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
-    struct AV1LrSyncData *const lr_sync);
-PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
-int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+    struct aom_internal_error_info *error_info);
+
+void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv,
+                                  int *plane_w, int *plane_h);
+int av1_lr_count_units(int unit_size, int plane_size);
 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                              const int sb_cols, int plane);
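
A note on the plane-based helpers declared above: av1_get_upsampled_plane_size() supplies the plane dimensions that loop restoration now operates on, and av1_lr_count_units() counts restoration units directly against a plane dimension rather than a tile dimension. The snippet below is only a hedged sketch of that counting rule, following the AV1 convention that a trailing remainder smaller than half a unit is absorbed by the previous unit; count_units_sketch is an illustrative name, not the library's implementation.

#include <assert.h>

// Illustrative sketch (assumption): number of restoration units along one
// plane dimension, with a trailing remainder smaller than half a unit merged
// into the previous unit and a minimum of one unit per plane.
static int count_units_sketch(int unit_size, int plane_size) {
  assert(unit_size > 0 && plane_size > 0);
  int units = (plane_size + (unit_size >> 1)) / unit_size;
  if (units < 1) units = 1;
  return units;
}

// Example: 64-pixel units over a 150-pixel plane give (150 + 32) / 64 = 2
// units, so the second unit stretches to cover the final 86 pixels.
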
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 6a4427b..8a6f290 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -57,6 +57,7 @@
 void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
                            int width, int num_workers) {
   lf_sync->rows = rows;
+  lf_sync->lf_mt_exit = false;
 #if CONFIG_MULTITHREAD
   {
     int i, j;
@@ -252,8 +253,12 @@
     const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
     struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
     int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+    struct aom_internal_error_info *error_info,
     AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf,
     int num_mis_in_lpf_unit_height_log2) {
+  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+  // in the future to handle error propagation.
+  (void)error_info;
   const int sb_cols =
       CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
   const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
@@ -300,6 +305,16 @@
         sync_read(lf_sync, r + 1, c, plane);
       }
 
+#if CONFIG_MULTITHREAD
+      if (lf_sync && lf_sync->num_workers > 1) {
+        pthread_mutex_lock(lf_sync->job_mutex);
+        const bool lf_mt_exit = lf_sync->lf_mt_exit;
+        pthread_mutex_unlock(lf_sync->job_mutex);
+        // Exit in case any worker has encountered an error.
+        if (lf_mt_exit) return;
+      }
+#endif
+
       av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
                            mi_row, mi_col, plane, plane + num_planes);
       if (lpf_opt_level) {
@@ -320,27 +335,93 @@
   }
 }
 
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+                                   int num_mis_in_lpf_unit_height_log2) {
+  int plane, sb_row;
+  const int sb_cols =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, num_mis_in_lpf_unit_height_log2);
+  const int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
+
+  // In case of loopfilter row-multithreading, the worker on an SB row waits for
+  // the vertical edge filtering of the right and top-right SBs. Hence, if any
+  // thread (main or worker) encounters an error, mark the vertical
+  // loopfiltering of every SB row in the frame as complete so that dependent
+  // workers do not wait indefinitely.
+  for (sb_row = 0; sb_row < sb_rows; ++sb_row)
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane);
+}
+
+static AOM_INLINE void sync_lf_workers(AVxWorker *const workers,
+                                       AV1_COMMON *const cm, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int had_error = workers[0].had_error;
+  struct aom_internal_error_info error_info;
+
+  // Read the error_info of the main thread.
+  if (had_error) {
+    AVxWorker *const worker = &workers[0];
+    error_info = ((LFWorkerData *)worker->data2)->error_info;
+  }
+
+  // Wait till all rows are finished.
+  for (int i = num_workers - 1; i > 0; --i) {
+    AVxWorker *const worker = &workers[i];
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error_info = ((LFWorkerData *)worker->data2)->error_info;
+    }
+  }
+  if (had_error)
+    aom_internal_error(cm->error, error_info.error_code, "%s",
+                       error_info.detail);
+}
+
 // Row-based multi-threaded loopfilter hook
 static int loop_filter_row_worker(void *arg1, void *arg2) {
   AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
   LFWorkerData *const lf_data = (LFWorkerData *)arg2;
   AV1LfMTInfo *cur_job_info;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex_ = lf_sync->job_mutex;
+#endif
+
+  struct aom_internal_error_info *const error_info = &lf_data->error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(job_mutex_);
+    lf_sync->lf_mt_exit = true;
+    pthread_mutex_unlock(job_mutex_);
+#endif
+    av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
     const int lpf_opt_level = cur_job_info->lpf_opt_level;
     av1_thread_loop_filter_rows(
         lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
         cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
-        lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
-        MAX_MIB_SIZE_LOG2);
+        lpf_opt_level, lf_sync, error_info, lf_data->params_buf,
+        lf_data->tx_buf, MAX_MIB_SIZE_LOG2);
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 MACROBLOCKD *xd, int start, int stop,
-                                const int planes_to_lf[3], AVxWorker *workers,
-                                int num_workers, AV1LfSync *lf_sync,
-                                int lpf_opt_level) {
+                                const int planes_to_lf[MAX_MB_PLANE],
+                                AVxWorker *workers, int num_workers,
+                                AV1LfSync *lf_sync, int lpf_opt_level) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int i;
   loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
@@ -359,6 +440,7 @@
     loop_filter_data_reset(lf_data, frame, cm, xd);
 
     // Start loopfiltering
+    worker->had_error = 0;
     if (i == 0) {
       winterface->execute(worker);
     } else {
@@ -366,15 +448,13 @@
     }
   }
 
-  // Wait till all rows are finished
-  for (i = 1; i < num_workers; ++i) {
-    winterface->sync(&workers[i]);
-  }
+  sync_lf_workers(workers, cm, num_workers);
 }
 
 static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                              MACROBLOCKD *xd, int start, int stop,
-                             const int planes_to_lf[3], int lpf_opt_level) {
+                             const int planes_to_lf[MAX_MB_PLANE],
+                             int lpf_opt_level) {
   // Filter top rows of all planes first, in case the output can be partially
   // reconstructed row by row.
   int mi_row, plane, dir;
@@ -382,7 +462,7 @@
   AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
   TX_SIZE tx_buf[MAX_MIB_SIZE];
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (plane = 0; plane < 3; ++plane) {
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
         continue;
       }
@@ -390,7 +470,8 @@
       for (dir = 0; dir < 2; ++dir) {
         av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane,
                                     dir, lpf_opt_level, /*lf_sync=*/NULL,
-                                    params_buf, tx_buf, MAX_MIB_SIZE_LOG2);
+                                    xd->error_info, params_buf, tx_buf,
+                                    MAX_MIB_SIZE_LOG2);
       }
     }
   }
@@ -402,7 +483,7 @@
                               int num_workers, AV1LfSync *lf_sync,
                               int lpf_opt_level) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
-  int planes_to_lf[3];
+  int planes_to_lf[MAX_MB_PLANE];
 
   if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
                                    plane_end))
@@ -536,6 +617,7 @@
   }
 
   lr_sync->num_workers = num_workers;
+  lr_sync->lr_mt_exit = false;
 
   for (int j = 0; j < num_planes; j++) {
     CHECK_MEM_ERROR(
@@ -611,7 +693,7 @@
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     num_even_lr_jobs =
-        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1);
   }
   lr_job_counter[0] = 0;
   lr_job_counter[1] = num_even_lr_jobs;
@@ -620,26 +702,23 @@
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
     const int ss_y = is_uv && cm->seq_params->subsampling_y;
-
-    PixelRect tile_rect = ctxt[plane].tile_rect;
     const int unit_size = ctxt[plane].rsi->restoration_unit_size;
-
-    const int tile_h = tile_rect.bottom - tile_rect.top;
+    const int plane_h = ctxt[plane].plane_h;
     const int ext_size = unit_size * 3 / 2;
 
     int y0 = 0, i = 0;
-    while (y0 < tile_h) {
-      int remaining_h = tile_h - y0;
+    while (y0 < plane_h) {
+      int remaining_h = plane_h - y0;
       int h = (remaining_h < ext_size) ? remaining_h : unit_size;
 
       RestorationTileLimits limits;
-      limits.v_start = tile_rect.top + y0;
-      limits.v_end = tile_rect.top + y0 + h;
-      assert(limits.v_end <= tile_rect.bottom);
-      // Offset the tile upwards to align with the restoration processing stripe
+      limits.v_start = y0;
+      limits.v_end = y0 + h;
+      assert(limits.v_end <= plane_h);
+      // Offset upwards to align with the restoration processing stripe
       const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
-      limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
-      if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
+      limits.v_start = AOMMAX(0, limits.v_start - voffset);
+      if (limits.v_end < plane_h) limits.v_end -= voffset;
 
       assert(lr_job_counter[0] <= num_even_lr_jobs);
 
@@ -654,18 +733,18 @@
         lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
             limits.v_end - RESTORATION_BORDER;
         if (i == 0) {
-          assert(limits.v_start == tile_rect.top);
-          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+          assert(limits.v_start == 0);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0;
         }
-        if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
-          assert(limits.v_end == tile_rect.bottom);
-          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+        if (i == (ctxt[plane].rsi->vert_units - 1)) {
+          assert(limits.v_end == plane_h);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h;
         }
       } else {
         lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
-            AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+            AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
         lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
-            AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+            AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h);
       }
       lr_job_counter[i & 1]++;
       lr_sync->jobs_enqueued++;
@@ -682,7 +761,7 @@
 #if CONFIG_MULTITHREAD
   pthread_mutex_lock(lr_sync->job_mutex);
 
-  if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
+  if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
     cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
     lr_sync->jobs_dequeued++;
   }
@@ -695,6 +774,26 @@
   return cur_job_info;
 }
 
+static void set_loop_restoration_done(AV1LrSync *const lr_sync,
+                                      FilterFrameCtxt *const ctxt) {
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (ctxt[plane].rsi->frame_restoration_type == RESTORE_NONE) continue;
+    int y0 = 0, row_number = 0;
+    const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+    const int plane_h = ctxt[plane].plane_h;
+    const int ext_size = unit_size * 3 / 2;
+    const int hnum_rest_units = ctxt[plane].rsi->horz_units;
+    while (y0 < plane_h) {
+      const int remaining_h = plane_h - y0;
+      const int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+      lr_sync_write(lr_sync, row_number, hnum_rest_units - 1, hnum_rest_units,
+                    plane);
+      y0 += h;
+      ++row_number;
+    }
+  }
+}
+
 // Implement row loop restoration for each thread.
 static int loop_restoration_row_worker(void *arg1, void *arg2) {
   AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
@@ -703,16 +802,39 @@
   FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
   int lr_unit_row;
   int plane;
-  const int tile_row = LR_TILE_ROW;
-  const int tile_col = LR_TILE_COL;
-  const int tile_cols = LR_TILE_COLS;
-  const int tile_idx = tile_col + tile_row * tile_cols;
+  int plane_w;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex_ = lr_sync->job_mutex;
+#endif
+  struct aom_internal_error_info *const error_info = &lrworkerdata->error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(job_mutex_);
+    lr_sync->lr_mt_exit = true;
+    pthread_mutex_unlock(job_mutex_);
+#endif
+    // In case of loop restoration multithreading, the worker on an even lr
+    // block row waits for the completion of the filtering of the top-right and
+    // bottom-right blocks. Hence, if any thread (main or worker) encounters an
+    // error, mark the filtering of every row in the frame as complete so that
+    // the dependent workers do not wait indefinitely.
+    set_loop_restoration_done(lr_sync, lr_ctxt->ctxt);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                            YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                            int vstart, int vend);
-  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
-                                         aom_yv12_partial_coloc_copy_u,
-                                         aom_yv12_partial_coloc_copy_v };
+  static const copy_fun copy_funs[MAX_MB_PLANE] = {
+    aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
+    aom_yv12_partial_coloc_copy_v
+  };
 
   while (1) {
     AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
@@ -724,7 +846,7 @@
       limits.v_end = cur_job_info->v_end;
       lr_unit_row = cur_job_info->lr_unit_row;
       plane = cur_job_info->plane;
-      const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
+      plane_w = ctxt[plane].plane_w;
 
       // sync_mode == 1 implies only sync read is required in LR Multi-threading
       // sync_mode == 0 implies only sync write is required.
@@ -734,16 +856,14 @@
                                                    : av1_lr_sync_write_dummy;
 
       av1_foreach_rest_unit_in_row(
-          &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
-          ctxt[plane].rsi->restoration_unit_size, unit_idx0,
-          ctxt[plane].rsi->horz_units_per_tile,
-          ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+          &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row,
+          ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units,
+          ctxt[plane].rsi->vert_units, plane, &ctxt[plane],
           lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
-          on_sync_write, lr_sync);
+          on_sync_write, lr_sync, error_info);
 
-      copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
-                       ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
-                       cur_job_info->v_copy_end);
+      copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w,
+                       cur_job_info->v_copy_start, cur_job_info->v_copy_end);
 
       if (lrworkerdata->do_extend_border) {
         aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
@@ -754,11 +874,37 @@
       break;
     }
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
+static AOM_INLINE void sync_lr_workers(AVxWorker *const workers,
+                                       AV1_COMMON *const cm, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int had_error = workers[0].had_error;
+  struct aom_internal_error_info error_info;
+
+  // Read the error_info of the main thread.
+  if (had_error) {
+    AVxWorker *const worker = &workers[0];
+    error_info = ((LRWorkerData *)worker->data2)->error_info;
+  }
+
+  // Wait till all rows are finished.
+  for (int i = num_workers - 1; i > 0; --i) {
+    AVxWorker *const worker = &workers[i];
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error_info = ((LRWorkerData *)worker->data2)->error_info;
+    }
+  }
+  if (had_error)
+    aom_internal_error(cm->error, error_info.error_code, "%s",
+                       error_info.detail);
+}
+
 static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
-                                           AVxWorker *workers, int nworkers,
+                                           AVxWorker *workers, int num_workers,
                                            AV1LrSync *lr_sync, AV1_COMMON *cm,
                                            int do_extend_border) {
   FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
@@ -771,16 +917,12 @@
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
 
-    const PixelRect tile_rect = ctxt[plane].tile_rect;
-    const int max_tile_h = tile_rect.bottom - tile_rect.top;
-
+    const int plane_h = ctxt[plane].plane_h;
     const int unit_size = cm->rst_info[plane].restoration_unit_size;
 
-    num_rows_lr =
-        AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
+    num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h));
   }
 
-  const int num_workers = nworkers;
   int i;
   assert(MAX_MB_PLANE == 3);
 
@@ -809,6 +951,7 @@
     worker->data2 = &lr_sync->lrworkerdata[i];
 
     // Start loop restoration
+    worker->had_error = 0;
     if (i == 0) {
       winterface->execute(worker);
     } else {
@@ -816,10 +959,7 @@
     }
   }
 
-  // Wait till all rows are finished
-  for (i = 1; i < num_workers; ++i) {
-    winterface->sync(&workers[i]);
-  }
+  sync_lr_workers(workers, cm, num_workers);
 }
 
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
@@ -852,6 +992,7 @@
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &workers[i];
+    worker->had_error = 0;
     if (i == 0)
       winterface->execute(worker);
     else
@@ -863,16 +1004,26 @@
                                          AV1_COMMON *const cm,
                                          int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int had_error = 0;
+  int had_error = workers[0].had_error;
+  struct aom_internal_error_info error_info;
 
-  // Wait for completion of Cdef frame.
-  for (int i = num_workers - 1; i > 0; i--) {
+  // Read the error_info of the main thread.
+  if (had_error) {
+    AVxWorker *const worker = &workers[0];
+    error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+  }
+
+  // Wait till all rows are finished.
+  for (int i = num_workers - 1; i > 0; --i) {
     AVxWorker *const worker = &workers[i];
-    had_error |= !winterface->sync(worker);
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+    }
   }
   if (had_error)
-    aom_internal_error(cm->error, AOM_CODEC_ERROR,
-                       "Failed to process cdef frame");
+    aom_internal_error(cm->error, error_info.error_code, "%s",
+                       error_info.detail);
 }
 
 // Updates the row index of the next job to be processed.
@@ -888,14 +1039,15 @@
 // Checks if a job is available. If job is available,
 // populates next job information and returns 1, else returns 0.
 static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
-                                            int *cur_fbr, const int nvfb) {
+                                            volatile int *cur_fbr,
+                                            const int nvfb) {
 #if CONFIG_MULTITHREAD
   pthread_mutex_lock(cdef_sync->mutex_);
 #endif  // CONFIG_MULTITHREAD
   int do_next_row = 0;
   // Populates information needed for the current job and updates the row
   // index of the next row to be processed.
-  if (cdef_sync->end_of_frame == 0) {
+  if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
     do_next_row = 1;
     *cur_fbr = cdef_sync->fbr;
     update_cdef_row_next_job_info(cdef_sync, nvfb);
@@ -906,19 +1058,49 @@
   return do_next_row;
 }
 
+static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) {
+  for (int fbr = 0; fbr < nvfb; fbr++) cdef_row_mt_sync_write(cdef_sync, fbr);
+}
+
 // Hook function for each thread in CDEF multi-threading.
 static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
   AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
   AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
   AV1_COMMON *cm = cdef_worker->cm;
   const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int cur_fbr;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
+#endif
+  struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(job_mutex_);
+    cdef_sync->cdef_mt_exit = true;
+    pthread_mutex_unlock(job_mutex_);
+#endif
+    // In case of cdef row-multithreading, the worker on a filter block row
+    // (fbr) waits for the top and bottom line-buffer copies of the row above.
+    // Hence, if any thread (main or worker) encounters an error before copying
+    // the line buffers, mark the line-buffer copy as complete so that
+    // dependent workers do not wait indefinitely.
+    set_cdef_init_fb_row_done(cdef_sync, nvfb);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  volatile int cur_fbr;
   const int num_planes = av1_num_planes(cm);
   while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
     MACROBLOCKD *xd = cdef_worker->xd;
     av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
                     cdef_worker->srcbuf, cur_fbr,
-                    cdef_worker->cdef_init_fb_row_fn, cdef_sync);
+                    cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
     if (cdef_worker->do_extend_border) {
       for (int plane = 0; plane < num_planes; ++plane) {
         const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
@@ -932,6 +1114,7 @@
       }
     }
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
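The three worker hooks patched above (loop filter, loop restoration, CDEF) now share the same error-handling shape: setjmp() on entry; on a longjmp() from aom_internal_error(), set the shared *_mt_exit flag under the job mutex, release any row-sync waiters, and return 0 so that the corresponding sync routine can re-raise the stored error on the main thread. The sketch below is a stripped-down illustration of that shape only; ToySync, ToyErrorInfo and toy_worker_hook are hypothetical names, not libaom types.

#include <pthread.h>
#include <setjmp.h>
#include <stdbool.h>

typedef struct {
  pthread_mutex_t *job_mutex;
  bool mt_exit;  // set by the first worker that hits an error
} ToySync;

typedef struct {
  jmp_buf jmp;
  int setjmp_valid;
} ToyErrorInfo;

// Hypothetical worker hook mirroring the pattern used in the hooks above.
static int toy_worker_hook(ToySync *sync, ToyErrorInfo *error_info) {
  // The jmp_buf is valid only while this function is on the stack, so the
  // validity flag must be cleared on every exit path.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp_valid = 0;
    pthread_mutex_lock(sync->job_mutex);
    sync->mt_exit = true;  // stop the job dispenser for all workers
    pthread_mutex_unlock(sync->job_mutex);
    // The real hooks also mark every pending row sync as complete here so
    // that workers blocked in sync_read()/lr_sync_read() are released.
    return 0;  // reported as had_error; the sync step re-raises the error
  }
  error_info->setjmp_valid = 1;
  // ... fetch jobs and filter rows; failures longjmp() back to the block above.
  error_info->setjmp_valid = 0;
  return 1;
}
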
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index b6485c3..6d695e8 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -54,6 +54,10 @@
   AV1LfMTInfo *job_queue;
   int jobs_enqueued;
   int jobs_dequeued;
+
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool lf_mt_exit;
 } AV1LfSync;
 
 typedef struct AV1LrMTInfo {
@@ -71,6 +75,7 @@
   void *rlbs;
   void *lr_ctxt;
   int do_extend_border;
+  struct aom_internal_error_info error_info;
 } LRWorkerData;
 
 // Looprestoration row synchronization
@@ -98,6 +103,9 @@
   AV1LrMTInfo *job_queue;
   int jobs_enqueued;
   int jobs_dequeued;
+  // Initialized to false, set to true by the worker thread that encounters
+  // an error in order to abort the processing of other worker threads.
+  bool lr_mt_exit;
 } AV1LrSync;
 
 typedef struct AV1CdefWorker {
@@ -108,6 +116,7 @@
   uint16_t *linebuf[MAX_MB_PLANE];
   cdef_init_fb_row_t cdef_init_fb_row_fn;
   int do_extend_border;
+  struct aom_internal_error_info error_info;
 } AV1CdefWorkerData;
 
 typedef struct AV1CdefRowSync {
@@ -132,6 +141,9 @@
   int fbr;
   // Column index in units of 64x64 block
   int fbc;
+  // Initialized to false, set to true by the worker thread that encounters
+  // an error in order to abort the processing of other worker threads.
+  bool cdef_mt_exit;
 } AV1CdefSync;
 
 void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
@@ -164,6 +176,9 @@
 void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
                            int width, int num_workers);
 
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+                                   int num_mis_in_lpf_unit_height_log2);
+
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                               struct macroblockd *xd, int plane_start,
                               int plane_end, int partial_frame,
@@ -185,11 +200,11 @@
     const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
     struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
     int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+    struct aom_internal_error_info *error_info,
     AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2);
 
-static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
-                                                    int plane,
-                                                    int lpf_opt_level) {
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(
+    const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
   // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
   // chroma planes together
   if (lpf_opt_level == 2) {
@@ -212,7 +227,7 @@
 }
 
 static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
-                                       const int planes_to_lf[3],
+                                       const int planes_to_lf[MAX_MB_PLANE],
                                        int lpf_opt_level,
                                        int num_mis_in_lpf_unit_height) {
   int mi_row, plane, dir;
@@ -225,7 +240,7 @@
   // partially reconstructed row by row.
   for (dir = 0; dir < 2; ++dir) {
     for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
-      for (plane = 0; plane < 3; ++plane) {
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
           continue;
         }
@@ -242,9 +257,9 @@
 }
 
 static AOM_INLINE void loop_filter_frame_mt_init(
-    AV1_COMMON *cm, int start_mi_row, int end_mi_row, const int planes_to_lf[3],
-    int num_workers, AV1LfSync *lf_sync, int lpf_opt_level,
-    int num_mis_in_lpf_unit_height_log2) {
+    AV1_COMMON *cm, int start_mi_row, int end_mi_row,
+    const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync,
+    int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
   // Number of superblock rows
   const int sb_rows =
       CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
@@ -271,7 +286,7 @@
 #if CONFIG_MULTITHREAD
   pthread_mutex_lock(lf_sync->job_mutex);
 
-  if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+  if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
     cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
     lf_sync->jobs_dequeued++;
   }
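
The lf_mt_exit/lr_mt_exit/cdef_mt_exit flags added above are consumed by the job dispensers, which are now guarded so that no further work is handed out after a failure. A hedged sketch of that guarded dequeue (ToyJob and ToyJobSync are illustrative names only):

#include <pthread.h>
#include <stdbool.h>

typedef struct { int row; } ToyJob;  // illustrative job record

typedef struct {
  pthread_mutex_t *job_mutex;
  bool mt_exit;
  int jobs_enqueued, jobs_dequeued;
} ToyJobSync;

// Once mt_exit is set by a failing worker, no further jobs are dequeued, so
// every remaining worker drains out of its processing loop promptly.
static ToyJob *toy_get_next_job(ToyJobSync *sync, ToyJob *queue) {
  ToyJob *job = NULL;
  pthread_mutex_lock(sync->job_mutex);
  if (!sync->mt_exit && sync->jobs_dequeued < sync->jobs_enqueued)
    job = &queue[sync->jobs_dequeued++];
  pthread_mutex_unlock(sync->job_mutex);
  return job;
}
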
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c
index 508fe30..b964f25 100644
--- a/av1/common/tile_common.c
+++ b/av1/common/tile_common.c
@@ -167,12 +167,12 @@
   assert(tile->mi_col_end > tile->mi_col_start);
 }
 
-int av1_get_sb_rows_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
   return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
                            cm->seq_params->mib_size_log2);
 }
 
-int av1_get_sb_cols_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
   return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
                            cm->seq_params->mib_size_log2);
 }
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index 8615a2c..5383ae9 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -40,8 +40,8 @@
 void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
 void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
 
-int av1_get_sb_rows_in_tile(struct AV1Common *cm, const TileInfo *tile);
-int av1_get_sb_cols_in_tile(struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile);
 
 // Return the pixel extents of the given tile
 PixelRect av1_get_tile_rect(const TileInfo *tile_info,
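
The av1_get_sb_rows_in_tile()/av1_get_sb_cols_in_tile() helpers made const-correct above are simple ceiling divisions by a power of two. A worked instance, assuming CEIL_POWER_OF_TWO(value, n) expands to (value + (1 << n) - 1) >> n:

// With 128x128 superblocks a mode-info (MI) unit is 4x4 pixels, so
// mib_size_log2 == 5, and a tile spanning 70 MI rows occupies
//   (70 + 31) >> 5 == 3 superblock rows.
static int ceil_power_of_two_sketch(int value, int n_log2) {
  return (value + (1 << n_log2) - 1) >> n_log2;
}
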
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 83f410e..4282b92 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -17,6 +17,7 @@
 
 #include "config/av1_rtcd.h"
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
@@ -214,10 +215,42 @@
     return 1;
 }
 
+#ifndef NDEBUG
+// Check that the given warp model satisfies the relevant constraints for
+// its stated model type
+static void check_model_consistency(WarpedMotionParams *wm) {
+  switch (wm->wmtype) {
+    case IDENTITY:
+      assert(wm->wmmat[0] == 0);
+      assert(wm->wmmat[1] == 0);
+      AOM_FALLTHROUGH_INTENDED;
+    case TRANSLATION:
+      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
+      assert(wm->wmmat[3] == 0);
+      AOM_FALLTHROUGH_INTENDED;
+    case ROTZOOM:
+      assert(wm->wmmat[4] == -wm->wmmat[3]);
+      assert(wm->wmmat[5] == wm->wmmat[2]);
+      AOM_FALLTHROUGH_INTENDED;
+    case AFFINE: break;
+    default: assert(0 && "Bad wmtype");
+  }
+}
+#endif  // NDEBUG
+
 // Returns 1 on success or 0 on an invalid affine set
 int av1_get_shear_params(WarpedMotionParams *wm) {
+#ifndef NDEBUG
+  // Check that models have been constructed sensibly
+  // This is a good place to check, because this function does not need to
+  // be called until after model construction is complete, but must be called
+  // before the model can be used for prediction.
+  check_model_consistency(wm);
+#endif  // NDEBUG
+
   const int32_t *mat = wm->wmmat;
   if (!is_affine_valid(wm)) return 0;
+
   wm->alpha =
       clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
   wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
@@ -247,17 +280,6 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-static INLINE int highbd_error_measure(int err, int bd) {
-  const int b = bd - 8;
-  const int bmask = (1 << b) - 1;
-  const int v = (1 << b);
-  err = abs(err);
-  const int e1 = err >> b;
-  const int e2 = err & bmask;
-  return error_measure_lut[255 + e1] * (v - e2) +
-         error_measure_lut[256 + e1] * e2;
-}
-
 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
     for hardware implementations, see the comments above av1_warp_affine_c
 */
@@ -269,9 +291,7 @@
                               ConvolveParams *conv_params, int16_t alpha,
                               int16_t beta, int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
-  const int reduce_bits_horiz =
-      conv_params->round_0 +
-      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_horiz = conv_params->round_0;
   const int reduce_bits_vert = conv_params->is_compound
                                    ? conv_params->round_1
                                    : 2 * FILTER_BITS - reduce_bits_horiz;
@@ -284,6 +304,10 @@
   (void)max_bits_horiz;
   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
   for (int i = p_row; i < p_row + p_height; i += 8) {
     for (int j = p_col; j < p_col + p_width; j += 8) {
       // Calculate the center of this 8x8 block,
@@ -392,11 +416,6 @@
                        int p_col, int p_row, int p_width, int p_height,
                        int p_stride, int subsampling_x, int subsampling_y,
                        int bd, ConvolveParams *conv_params) {
-  assert(wm->wmtype <= AFFINE);
-  if (wm->wmtype == ROTZOOM) {
-    wm->wmmat[5] = wm->wmmat[2];
-    wm->wmmat[4] = -wm->wmmat[3];
-  }
   const int32_t *const mat = wm->wmmat;
   const int16_t alpha = wm->alpha;
   const int16_t beta = wm->beta;
@@ -408,46 +427,6 @@
                          subsampling_y, bd, conv_params, alpha, beta, gamma,
                          delta);
 }
-
-int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
-                                    const uint16_t *const dst, int p_width,
-                                    int p_height, int p_stride, int bd) {
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; ++i) {
-    for (int j = 0; j < p_width; ++j) {
-      sum_error +=
-          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
-    }
-  }
-  return sum_error;
-}
-
-static int64_t highbd_segmented_frame_error(
-    const uint16_t *const ref, int stride, const uint16_t *const dst,
-    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
-    int segment_map_stride) {
-  int patch_w, patch_h;
-  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
-      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
-      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
-      // Only compute the error if this block contains inliers from the motion
-      // model
-      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-
-      // avoid computing error into the frame padding
-      patch_w = AOMMIN(error_bsize_w, p_width - j);
-      patch_h = AOMMIN(error_bsize_h, p_height - i);
-      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
-                                               dst + j + i * p_stride, patch_w,
-                                               patch_h, p_stride, bd);
-    }
-  }
-  return sum_error;
-}
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* The warp filter for ROTZOOM and AFFINE models works as follows:
@@ -669,11 +648,6 @@
                 int height, int stride, uint8_t *pred, int p_col, int p_row,
                 int p_width, int p_height, int p_stride, int subsampling_x,
                 int subsampling_y, ConvolveParams *conv_params) {
-  assert(wm->wmtype <= AFFINE);
-  if (wm->wmtype == ROTZOOM) {
-    wm->wmmat[5] = wm->wmmat[2];
-    wm->wmmat[4] = -wm->wmmat[3];
-  }
   const int32_t *const mat = wm->wmmat;
   const int16_t alpha = wm->alpha;
   const int16_t beta = wm->beta;
@@ -684,79 +658,6 @@
                   alpha, beta, gamma, delta);
 }
 
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
-                               const uint8_t *const dst, int p_width,
-                               int p_height, int p_stride) {
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; ++i) {
-    for (int j = 0; j < p_width; ++j) {
-      sum_error +=
-          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
-    }
-  }
-  return sum_error;
-}
-
-static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
-                                     const uint8_t *const dst, int p_width,
-                                     int p_height, int p_stride,
-                                     uint8_t *segment_map,
-                                     int segment_map_stride) {
-  int patch_w, patch_h;
-  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
-      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
-      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
-      // Only compute the error if this block contains inliers from the motion
-      // model
-      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-
-      // avoid computing error into the frame padding
-      patch_w = AOMMIN(error_bsize_w, p_width - j);
-      patch_h = AOMMIN(error_bsize_h, p_height - i);
-      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
-                                        dst + j + i * p_stride, patch_w,
-                                        patch_h, p_stride);
-    }
-  }
-  return sum_error;
-}
-
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
-                        uint8_t *dst, int p_width, int p_height, int p_stride) {
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (use_hbd) {
-    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
-                                       CONVERT_TO_SHORTPTR(dst), p_width,
-                                       p_height, p_stride, bd);
-  }
-#endif
-  (void)use_hbd;
-  (void)bd;
-  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
-}
-
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
-                                  int stride, uint8_t *dst, int p_width,
-                                  int p_height, int p_stride,
-                                  uint8_t *segment_map,
-                                  int segment_map_stride) {
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (use_hbd) {
-    return highbd_segmented_frame_error(
-        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
-        p_height, p_stride, bd, segment_map, segment_map_stride);
-  }
-#endif
-  (void)use_hbd;
-  (void)bd;
-  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
-                               segment_map, segment_map_stride);
-}
-
 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                     const uint8_t *ref, int width, int height, int stride,
                     uint8_t *pred, int p_col, int p_row, int p_width,
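
The simplified reduce_bits_horiz above leans on the new assertion that the horizontal filter output still fits the unsigned 16-bit intermediate array. A worked instance of that inequality, assuming FILTER_BITS == 7 and the minimum round_0 values implied elsewhere by assert(!(bd == 12 && reduce_bits_horiz < 5)) (the 10-bit round_0 of 3 is an assumption):

  12-bit input:  bd + FILTER_BITS + 2 - round_0 = 12 + 7 + 2 - 5 = 16 <= 16
  10-bit input:  10 + 7 + 2 - 3 = 16 <= 16

so the horizontal pass never overflows the 16-bit temporary.
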
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index d6fe325..d772df8 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -38,76 +38,6 @@
 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
 
-/* clang-format off */
-static const int error_measure_lut[512] = {
-    // pow 0.7
-    16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
-    16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
-    15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
-    15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
-    14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
-    14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
-    14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
-    13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
-    13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
-    12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
-    12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
-    12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
-    11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
-    11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
-    10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
-    10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
-    10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
-    9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
-    9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
-    8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
-    8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
-    7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
-    7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
-    6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
-    6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
-    5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
-    5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
-    4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
-    3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
-    3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
-    2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
-    1323, 1187, 1045,  894,  731,  550,  339,    0,
-    339,  550,  731,  894, 1045, 1187, 1323, 1452,
-    1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
-    2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
-    3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
-    3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
-    4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
-    5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
-    5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
-    6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
-    6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
-    7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
-    7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
-    8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
-    8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
-    9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
-    9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
-    10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
-    10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
-    11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
-    11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
-    11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
-    12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
-    12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
-    13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
-    13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
-    13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
-    14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
-    14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
-    15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
-    15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
-    15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
-    16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
 static const uint8_t warp_pad_left[14][16] = {
   { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -142,24 +72,6 @@
   { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 };
 
-static INLINE int error_measure(int err) {
-  return error_measure_lut[255 + err];
-}
-
-// Returns the error between the frame described by 'ref' and the frame
-// described by 'dst'.
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
-                        uint8_t *dst, int p_width, int p_height, int p_stride);
-
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
-                                  int stride, uint8_t *dst, int p_width,
-                                  int p_height, int p_stride,
-                                  uint8_t *segment_map, int segment_map_stride);
-
-int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
-                                    const uint16_t *const dst, int p_width,
-                                    int p_height, int p_stride, int bd);
-
 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                        int width, int height, int stride, uint16_t *const pred,
                        int p_col, int p_row, int p_width, int p_height,
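
For reference, the error_measure_lut table deleted from the common warped-motion code above (together with the error_measure()/av1_frame_error() helpers that used it) maps a pixel difference e in [-255, 255] to a cost following the pow-0.7 curve named in its leading comment, scaled so that |e| = 255 maps to 16384. The generator below is only a hypothetical illustration consistent with the removed values (e.g. |e| = 1 -> 339, |e| = 2 -> 550); it is not code that exists in libaom.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Hypothetical generator: entry (255 + e) is roughly
// round(16384 * (|e| / 255)^0.7), with the final entry clamped at |e| = 255.
int main(void) {
  for (int e = -255; e <= 256; ++e) {
    const int err = abs(e) > 255 ? 255 : abs(e);
    const long v = lround(16384.0 * pow(err / 255.0, 0.7));
    printf("%ld%s", v, e == 256 ? "\n" : ", ");
  }
  return 0;
}
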
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 67b28bc..8e293b5 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -12,7 +12,7 @@
 #include <assert.h>
 #include <smmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index de850ee..d65318c 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -12,7 +12,7 @@
 #include <immintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c
index b2c39cd..89d7199 100644
--- a/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -13,7 +13,7 @@
 #include <smmintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c
index 8324044..88974ba 100644
--- a/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -12,7 +12,7 @@
 #include <tmmintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index da52ecd..6dcac10 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -12,7 +12,7 @@
 #include <immintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c
index af45764..5a7fc53 100644
--- a/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -12,7 +12,7 @@
 #include <smmintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c
index 7f6aceb..75108b4 100644
--- a/av1/common/x86/highbd_warp_affine_avx2.c
+++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -22,9 +22,7 @@
                                  ConvolveParams *conv_params, int16_t alpha,
                                  int16_t beta, int16_t gamma, int16_t delta) {
   __m256i tmp[15];
-  const int reduce_bits_horiz =
-      conv_params->round_0 +
-      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_horiz = conv_params->round_0;
   const int reduce_bits_vert = conv_params->is_compound
                                    ? conv_params->round_1
                                    : 2 * FILTER_BITS - reduce_bits_horiz;
@@ -37,6 +35,10 @@
   (void)max_bits_horiz;
   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
   const __m256i clip_pixel =
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c
index 9df0ddc..96fb4cf 100644
--- a/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -302,9 +302,7 @@
                                    int16_t beta, int16_t gamma, int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
-  const int reduce_bits_horiz =
-      conv_params->round_0 +
-      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_horiz = conv_params->round_0;
   const int reduce_bits_vert = conv_params->is_compound
                                    ? conv_params->round_1
                                    : 2 * FILTER_BITS - reduce_bits_horiz;
@@ -313,6 +311,10 @@
   assert(!(bd == 12 && reduce_bits_horiz < 5));
   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
   const __m128i clip_pixel =
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
diff --git a/av1/common/x86/highbd_wiener_convolve_avx2.c b/av1/common/x86/highbd_wiener_convolve_avx2.c
index 0c8a850..562c623 100644
--- a/av1/common/x86/highbd_wiener_convolve_avx2.c
+++ b/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -12,7 +12,7 @@
 #include <immintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -29,7 +29,7 @@
     const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     const int16_t *filter_y, int y_step_q4, int w, int h,
-    const ConvolveParams *conv_params, int bd) {
+    const WienerConvolveParams *conv_params, int bd) {
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
diff --git a/av1/common/x86/highbd_wiener_convolve_ssse3.c b/av1/common/x86/highbd_wiener_convolve_ssse3.c
index 818b109..cab37fa 100644
--- a/av1/common/x86/highbd_wiener_convolve_ssse3.c
+++ b/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -12,7 +12,7 @@
 #include <tmmintrin.h>
 #include <assert.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -22,7 +22,7 @@
     const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     const int16_t *filter_y, int y_step_q4, int w, int h,
-    const ConvolveParams *conv_params, int bd) {
+    const WienerConvolveParams *conv_params, int bd) {
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c
index f025f79..3eee46f 100644
--- a/av1/common/x86/intra_edge_sse4.c
+++ b/av1/common/x86/intra_edge_sse4.c
@@ -113,7 +113,69 @@
   }
 }
 
-void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+  // interpolate half-sample positions
+  assert(sz <= 24);
+
+  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+  };
+
+  DECLARE_ALIGNED(
+      16, static const int8_t,
+      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
+
+  // Extend first/last samples (upper-left p[-1], last p[sz-1])
+  // to support 4-tap filter
+  p[-2] = p[-1];
+  p[sz] = p[sz - 1];
+
+  uint8_t *in = &p[-2];
+  uint8_t *out = &p[-2];
+
+  int n = sz + 1;  // Input length including upper-left sample
+
+  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+  while (n > 0) {
+    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+    d0 = _mm_maddubs_epi16(d0, coef0);
+    d1 = _mm_maddubs_epi16(d1, coef0);
+    d2 = _mm_maddubs_epi16(d2, coef0);
+    d3 = _mm_maddubs_epi16(d3, coef0);
+    d0 = _mm_hadd_epi16(d0, d1);
+    d2 = _mm_hadd_epi16(d2, d3);
+    __m128i eight = _mm_set1_epi16(8);
+    d0 = _mm_add_epi16(d0, eight);
+    d2 = _mm_add_epi16(d2, eight);
+    d0 = _mm_srai_epi16(d0, 4);
+    d2 = _mm_srai_epi16(d2, 4);
+    d0 = _mm_packus_epi16(d0, d2);
+    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
+    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
+    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+    _mm_storeu_si128((__m128i *)&out[0], out0);
+    _mm_storeu_si128((__m128i *)&out[16], out1);
+    in0 = in16;
+    in16 = _mm_setzero_si128();
+    out += 32;
+    n -= 16;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
   if (!strength) return;
 
   DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
@@ -204,67 +266,7 @@
   }
 }
 
-void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
-  // interpolate half-sample positions
-  assert(sz <= 24);
-
-  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
-    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
-  };
-
-  DECLARE_ALIGNED(
-      16, static const int8_t,
-      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
-                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
-
-  // Extend first/last samples (upper-left p[-1], last p[sz-1])
-  // to support 4-tap filter
-  p[-2] = p[-1];
-  p[sz] = p[sz - 1];
-
-  uint8_t *in = &p[-2];
-  uint8_t *out = &p[-2];
-
-  int n = sz + 1;  // Input length including upper-left sample
-
-  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
-  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
-
-  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
-  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
-  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
-
-  while (n > 0) {
-    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
-    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
-    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
-    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
-    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
-    d0 = _mm_maddubs_epi16(d0, coef0);
-    d1 = _mm_maddubs_epi16(d1, coef0);
-    d2 = _mm_maddubs_epi16(d2, coef0);
-    d3 = _mm_maddubs_epi16(d3, coef0);
-    d0 = _mm_hadd_epi16(d0, d1);
-    d2 = _mm_hadd_epi16(d2, d3);
-    __m128i eight = _mm_set1_epi16(8);
-    d0 = _mm_add_epi16(d0, eight);
-    d2 = _mm_add_epi16(d2, eight);
-    d0 = _mm_srai_epi16(d0, 4);
-    d2 = _mm_srai_epi16(d2, 4);
-    d0 = _mm_packus_epi16(d0, d2);
-    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
-    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
-    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
-    _mm_storeu_si128((__m128i *)&out[0], out0);
-    _mm_storeu_si128((__m128i *)&out[16], out1);
-    in0 = in16;
-    in16 = _mm_setzero_si128();
-    out += 32;
-    n -= 16;
-  }
-}
-
-void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
+void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
   // interpolate half-sample positions
   assert(sz <= 24);
 
@@ -316,3 +318,5 @@
     n -= 8;
   }
 }
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
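For reference, a minimal scalar sketch of the half-sample upsampling that the SSE4.1 routines in this file vectorize, assuming the usual AV1 intra-edge definition visible in the code above (the (-1, 9, 9, -1) kernel, +8 rounding, a shift right by 4, a clamp to the 8-bit range, and interleaving with the original samples). The helper name is invented and the block is illustrative only, not part of the patch:

    #include <stdint.h>

    // Illustrative scalar equivalent of the low-bit-depth SSE4.1 routine:
    // interpolate half-sample positions along the intra edge p[-1 .. sz-1].
    static void upsample_intra_edge_scalar_sketch(uint8_t *p, int sz) {
      // Copy the edge into a temporary, duplicating the first and last
      // samples so the 4-tap filter has valid neighbors (sz <= 24).
      uint8_t in[32];
      in[0] = p[-1];
      in[1] = p[-1];
      for (int i = 0; i < sz; i++) in[i + 2] = p[i];
      in[sz + 2] = p[sz - 1];

      p[-2] = in[0];
      for (int i = 0; i < sz; i++) {
        // Half-sample between in[i + 1] and in[i + 2].
        int s = -in[i] + 9 * in[i + 1] + 9 * in[i + 2] - in[i + 3];
        s = (s + 8) >> 4;
        if (s < 0) s = 0;
        if (s > 255) s = 255;
        p[2 * i - 1] = (uint8_t)s;
        p[2 * i] = in[i + 2];  // keep the original full-sample position
      }
    }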
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index ae8f88e..9f82ed2 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -12,7 +12,7 @@
 #include <emmintrin.h>
 #include <immintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index ab937f9..8c5d9918f 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/av1/common/x86/jnt_convolve_ssse3.c b/av1/common/x86/jnt_convolve_ssse3.c
index d0cf763..f6bf678 100644
--- a/av1/common/x86/jnt_convolve_ssse3.c
+++ b/av1/common/x86/jnt_convolve_ssse3.c
@@ -11,7 +11,7 @@
 
 #include <tmmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c
index 95814b4..eb4a4d1 100644
--- a/av1/common/x86/reconinter_sse4.c
+++ b/av1/common/x86/reconinter_sse4.c
@@ -15,6 +15,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 #include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
 
 static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
                                 const __m128i s1) {
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 4ab35e8..5ab6c46 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -630,18 +630,17 @@
   return 0;
 }
 
-void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
-                                           int height, int stride, int eps,
-                                           const int *xqd, uint8_t *dst8,
-                                           int dst_stride, int32_t *tmpbuf,
-                                           int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+                                          int height, int stride, int eps,
+                                          const int *xqd, uint8_t *dst8,
+                                          int dst_stride, int32_t *tmpbuf,
+                                          int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
   const int ret = av1_selfguided_restoration_avx2(
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
-  (void)ret;
-  assert(!ret);
+  if (ret != 0) return ret;
   const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
   av1_decode_xq(xqd, xq, params);
@@ -721,4 +720,5 @@
       }
     }
   }
+  return 0;
 }
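With the return type changed from void to int, a failure in av1_selfguided_restoration_avx2() is now surfaced to the caller instead of being swallowed by a debug-only assert. A hedged, hypothetical caller sketch (illustration only; the wrapper name is invented and the prototype is assumed to come from the generated config/av1_rtcd.h):

    #include "config/av1_rtcd.h"  // assumed home of the prototype

    // Hypothetical caller, not part of this patch: propagate any non-zero
    // error code from the AVX2 self-guided restoration to the next level up.
    static int try_selfguided_restoration_avx2(
        const uint8_t *dat8, int width, int height, int stride, int eps,
        const int *xqd, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
        int bit_depth, int highbd) {
      const int err = av1_apply_selfguided_restoration_avx2(
          dat8, width, height, stride, eps, xqd, dst8, dst_stride, tmpbuf,
          bit_depth, highbd);
      if (err != 0) return err;  // restoration failed; let the caller decide
      return 0;                  // success
    }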
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 948bbfb..ac850f5 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -582,18 +582,17 @@
   return 0;
 }
 
-void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
-                                             int height, int stride, int eps,
-                                             const int *xqd, uint8_t *dst8,
-                                             int dst_stride, int32_t *tmpbuf,
-                                             int bit_depth, int highbd) {
+int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                            int height, int stride, int eps,
+                                            const int *xqd, uint8_t *dst8,
+                                            int dst_stride, int32_t *tmpbuf,
+                                            int bit_depth, int highbd) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
   const int ret = av1_selfguided_restoration_sse4_1(
       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
-  (void)ret;
-  assert(!ret);
+  if (ret != 0) return ret;
   const sgr_params_type *const params = &av1_sgr_params[eps];
   int xq[2];
   av1_decode_xq(xqd, xq, params);
@@ -659,4 +658,5 @@
       }
     }
   }
+  return 0;
 }
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index ceb836e..663b8cd 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -1022,116 +1022,6 @@
                                 shuffle_src);
 }
 
-int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int p_width,
-                                  int p_height, int dst_stride) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m256i row_error, col_error;
-  __m256i zero = _mm256_setzero_si256();
-  __m256i dup_255 = _mm256_set1_epi16(255);
-  col_error = zero;
-
-  for (i = 0; i < (p_height / 4); i++) {
-    row_error = _mm256_setzero_si256();
-    for (j = 0; j < (p_width / 16); j++) {
-      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
-      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
-      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
-      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
-      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
-      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
-      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
-      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
-
-      __m256i diff_1 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
-      __m256i diff_2 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
-      __m256i diff_3 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
-      __m256i diff_4 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
-
-      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
-      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
-      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
-      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
-      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
-      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
-      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
-      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
-
-      __m256i error_1_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
-      __m256i error_1_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
-      __m256i error_2_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
-      __m256i error_2_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
-      __m256i error_3_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
-      __m256i error_3_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
-      __m256i error_4_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
-      __m256i error_4_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
-
-      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
-      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
-      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
-      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
-
-      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
-      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
-
-      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
-      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
-    }
-    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
-    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
-    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm256_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int k = 0; k < 4; ++k) {
-        for (int l = j * 16; l < p_width; ++l) {
-          sum_error +=
-              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
-                                     ref[l + ((i * 4) + k) * ref_stride]);
-        }
-      }
-    }
-  }
-  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
-  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
-  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, sum_error_q_0);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  // Error summation for remaining height, which is not multiple of 4
-  if (p_height & 0x3) {
-    for (int k = i * 4; k < p_height; ++k) {
-      for (int l = 0; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
-                                            ref[l + k * ref_stride]);
-      }
-    }
-  }
-  return sum_error;
-}
-
 void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
                           int height, int stride, uint8_t *pred, int p_col,
                           int p_row, int p_width, int p_height, int p_stride,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
deleted file mode 100644
index f8fe578..0000000
--- a/av1/common/x86/warp_plane_sse2.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "av1/common/warped_motion.h"
-#include "config/av1_rtcd.h"
-
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int p_width,
-                                  int p_height, int dst_stride) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m128i row_error, col_error;
-  __m128i zero = _mm_setzero_si128();
-  __m128i dup_255 = _mm_set1_epi16(255);
-  col_error = zero;
-  for (i = 0; i < (p_height); i++) {
-    row_error = zero;
-    for (j = 0; j < (p_width / 16); j++) {
-      __m128i ref_8 =
-          _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
-      __m128i dst_8 =
-          _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
-      __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
-      __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
-      __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
-      __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
-
-      __m128i diff_1 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
-      __m128i diff_2 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
-
-      __m128i error_1_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
-      __m128i error_1_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
-      __m128i error_2_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
-      __m128i error_2_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
-
-      __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
-      __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
-      __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
-
-      row_error = _mm_add_epi32(row_error, error_1_2);
-    }
-    __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
-    __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
-    __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int l = j * 16; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
-                                            ref[l + i * ref_stride]);
-      }
-    }
-  }
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, col_error);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  return sum_error;
-}
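Both frame-error kernels removed here (the AVX2 one above and this SSE2 file) reduce to the same scalar accumulation, visible in their remainder loops. A hedged scalar sketch for reference; the function name is invented, and error_measure() is the LUT helper the deleted code already pulled in via av1/common/warped_motion.h:

    #include <stdint.h>
    #include "av1/common/warped_motion.h"  // error_measure()

    // Illustrative scalar equivalent of the removed kernels: sum the LUT-based
    // error measure of (dst - ref) over a p_width x p_height block.
    static int64_t calc_frame_error_scalar_sketch(const uint8_t *ref,
                                                  int ref_stride,
                                                  const uint8_t *dst,
                                                  int p_width, int p_height,
                                                  int dst_stride) {
      int64_t sum_error = 0;
      for (int i = 0; i < p_height; ++i) {
        for (int j = 0; j < p_width; ++j) {
          sum_error += (int64_t)error_measure(dst[j + i * dst_stride] -
                                              ref[j + i * ref_stride]);
        }
      }
      return sum_error;
    }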
diff --git a/av1/common/x86/wiener_convolve_avx2.c b/av1/common/x86/wiener_convolve_avx2.c
index b7ac683..3de630f 100644
--- a/av1/common/x86/wiener_convolve_avx2.c
+++ b/av1/common/x86/wiener_convolve_avx2.c
@@ -45,7 +45,7 @@
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h,
-                                      const ConvolveParams *conv_params) {
+                                      const WienerConvolveParams *conv_params) {
   const int bd = 8;
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
diff --git a/av1/common/x86/wiener_convolve_sse2.c b/av1/common/x86/wiener_convolve_sse2.c
index f9d00b7..1c039e8 100644
--- a/av1/common/x86/wiener_convolve_sse2.c
+++ b/av1/common/x86/wiener_convolve_sse2.c
@@ -23,7 +23,7 @@
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h,
-                                      const ConvolveParams *conv_params) {
+                                      const WienerConvolveParams *conv_params) {
   const int bd = 8;
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 5b76de8..e3cce40 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1275,9 +1275,13 @@
     const int num_planes = av1_num_planes(cm);
     for (int plane = 0; plane < num_planes; ++plane) {
       int rcol0, rcol1, rrow0, rrow1;
+
+      // Skip some unnecessary work if loop restoration is disabled
+      if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
       if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
                                              &rcol0, &rcol1, &rrow0, &rrow1)) {
-        const int rstride = cm->rst_info[plane].horz_units_per_tile;
+        const int rstride = cm->rst_info[plane].horz_units;
         for (int rrow = rrow0; rrow < rrow1; ++rrow) {
           for (int rcol = rcol0; rcol < rcol1; ++rcol) {
             const int runit_idx = rcol + rrow * rstride;
@@ -4326,7 +4330,6 @@
                        trans_dec_factor;
   }
 
-  assert(params->wmtype <= AFFINE);
   int good_shear_params = av1_get_shear_params(params);
   if (!good_shear_params) return 0;
 
@@ -5219,6 +5222,9 @@
       cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
+    for (int p = 0; p < av1_num_planes(cm); p++) {
+      av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+    }
   }
 
   const int use_highbd = cm->seq_params->use_highbitdepth;
@@ -5238,6 +5244,7 @@
   MACROBLOCKD *const xd = &pbi->dcb.xd;
   const int tile_count_tg = end_tile - start_tile + 1;
 
+  xd->error_info = cm->error;
   if (initialize_flag) setup_frame_info(pbi);
   const int num_planes = av1_num_planes(cm);
 
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 5f114f9..bb0ccf5f 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -311,7 +311,7 @@
 }
 
 static int read_intra_segment_id(AV1_COMMON *const cm,
-                                 const MACROBLOCKD *const xd, int bsize,
+                                 const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
                                  aom_reader *r, int skip) {
   struct segmentation *const seg = &cm->seg;
   if (!seg->enabled) return 0;  // Default for disabled segmentation
@@ -825,13 +825,13 @@
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
     }
+    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
     mbmi->angle_delta[PLANE_TYPE_UV] =
-        (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)))
-            ? read_angle_delta(r,
-                               ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+        (use_angle_delta && av1_is_directional_mode(intra_mode))
+            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
             : 0;
   } else {
-    // Avoid decoding angle_info if there is is no chroma prediction
+    // Avoid decoding angle_info if there is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
   }
   xd->cfl.store_y = store_cfl_required(cm, xd);
@@ -1086,13 +1086,13 @@
       mbmi->cfl_alpha_idx =
           read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
     }
+    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
     mbmi->angle_delta[PLANE_TYPE_UV] =
-        use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))
-            ? read_angle_delta(r,
-                               ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+        use_angle_delta && av1_is_directional_mode(intra_mode)
+            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
             : 0;
   } else {
-    // Avoid decoding angle_info if there is is no chroma prediction
+    // Avoid decoding angle_info if there is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
   }
   xd->cfl.store_y = store_cfl_required(cm, xd);
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index 236b296..8dcef5f 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -29,6 +29,26 @@
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/rdopt_utils.h"
 
+#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
+#define MB_WIENER_PRED_BUF_STRIDE 128
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
+  const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd);
+  assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
+  const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
+  const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
+  assert(buf_width == MB_WIENER_PRED_BUF_STRIDE);
+  const size_t buf_size =
+      (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf))
+      << is_high_bitdepth;
+  CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size));
+}
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
+  aom_free(td->wiener_tmp_pred_buf);
+  td->wiener_tmp_pred_buf = NULL;
+}
+
 void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
 
@@ -236,7 +256,7 @@
                                 int16_t *src_diff, tran_low_t *coeff,
                                 tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                 double *sum_rec_distortion,
-                                double *sum_est_rate) {
+                                double *sum_est_rate, uint8_t *pred_buffer) {
   AV1_COMMON *const cm = &cpi->common;
   uint8_t *buffer = cpi->source->y_buffer;
   int buf_stride = cpi->source->y_stride;
@@ -250,27 +270,21 @@
   const int coeff_count = block_size * block_size;
   const int mb_step = mi_size_wide[bsize];
   const BitDepthInfo bd_info = get_bit_depth_info(xd);
-  const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+  const MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
   AV1EncRowMultiThreadSync *const intra_row_mt_sync =
       &cpi->ppi->intra_row_mt_sync;
   const int mi_cols = cm->mi_params.mi_cols;
   const int mt_thread_id = mi_row / mb_step;
   // TODO(chengchen): test different unit step size
-  const int mt_unit_step = mi_size_wide[BLOCK_64X64];
+  const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
   const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
   int mt_unit_col = 0;
   const int is_high_bitdepth = is_cur_buf_hbd(xd);
 
-  // We use a scratch buffer to store the prediction.
-  // The stride is the max block size (128).
-  uint8_t *pred_buffer;
-  const int dst_buffer_stride = 128;
-  const int buf_width = 128;
-  const int buf_height = 128;
-  const size_t buf_size = (buf_width * buf_height * sizeof(*pred_buffer))
-                          << is_high_bitdepth;
-  CHECK_MEM_ERROR(cm, pred_buffer, aom_memalign(32, buf_size));
   uint8_t *dst_buffer = pred_buffer;
+  const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE;
+
   if (is_high_bitdepth) {
     uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer;
     dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16);
@@ -280,6 +294,18 @@
     if (mi_col % mt_unit_step == 0) {
       intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id,
                                     mt_unit_col);
+#if CONFIG_MULTITHREAD
+      const int num_workers =
+          AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+      if (num_workers > 1) {
+        const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+        pthread_mutex_lock(enc_row_mt->mutex_);
+        const bool exit = enc_row_mt->mb_wiener_mt_exit;
+        pthread_mutex_unlock(enc_row_mt->mutex_);
+        // Stop further processing in case any worker has encountered an error.
+        if (exit) break;
+      }
+#endif
     }
 
     PREDICTION_MODE best_mode = DC_PRED;
@@ -434,7 +460,6 @@
   }
   // Set the pointer to null since mbmi is only allocated inside this function.
   xd->mi = NULL;
-  aom_free(pred_buffer);
 }
 
 static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
@@ -449,7 +474,8 @@
   DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
   for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
     av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
-                               dqcoeff, sum_rec_distortion, sum_est_rate);
+                               dqcoeff, sum_rec_distortion, sum_est_rate,
+                               cpi->td.wiener_tmp_pred_buf);
   }
 }
 
@@ -565,6 +591,7 @@
           NULL, cpi->image_pyramid_levels, 0))
     aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
+  av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
   cpi->norm_wiener_variance = 0;
 
   MACROBLOCK *x = &cpi->td.mb;
@@ -647,6 +674,7 @@
   // Set the pointer to null since mbmi is only allocated inside this function.
   xd->mi = NULL;
   aom_free_frame_buffer(&cm->cur_frame->buf);
+  av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
 }
 
 static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize,
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h
index 9e10566..0d34ce0 100644
--- a/av1/encoder/allintra_vis.h
+++ b/av1/encoder/allintra_vis.h
@@ -20,6 +20,8 @@
 #include "av1/encoder/block.h"
 #include "av1/encoder/encoder.h"
 
+#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64
+
 void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
 
 void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
@@ -27,7 +29,7 @@
                                 int16_t *src_diff, tran_low_t *coeff,
                                 tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                 double *sum_rec_distortion,
-                                double *sum_est_rate);
+                                double *sum_est_rate, uint8_t *pred_buffer);
 
 void av1_set_mb_wiener_variance(AV1_COMP *cpi);
 
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index be51ba1..f48ff11 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -48,7 +48,8 @@
 // mode, and rate/distortion.
 static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
                                 const MB_MODE_INFO *mbmi, int64_t rate,
-                                int64_t dist, int bsize, int noise_level) {
+                                int64_t dist, BLOCK_SIZE bsize,
+                                int noise_level) {
   MV mv = mbmi->mv[0].as_mv;
   int is_compound = has_second_ref(mbmi);
   // Reject the block for lower-qp coding for non-compound mode if
@@ -642,11 +643,15 @@
 
 int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  // TODO(marpan): Tune these conditons, add QP dependence.
-  if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad) return 1;
+  const int qindex = cpi->common.quant_params.base_qindex;
   if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
       cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
-      cpi->rc.frame_source_sad < 1000)
+      cpi->rc.frame_source_sad < 1000 &&
+      qindex < 7 * (cpi->rc.worst_quality >> 3))
+    return 1;
+  // More aggressive skip.
+  else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+           cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality)
     return 1;
   return 0;
 }
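Illustrative arithmetic for the new qindex gates above (the 255 used here is an assumed 8-bit maximum, not taken from the patch): if rc.worst_quality is 255, the first branch additionally requires base_qindex < 7 * (255 >> 3) = 7 * 31 = 217, so frames coded near the maximum quantizer no longer disable the loop filter and CDEF through that path; the second, more aggressive branch only requires base_qindex < 255.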
diff --git a/av1/encoder/arm/crc32/hash_crc32.c b/av1/encoder/arm/crc32/hash_arm_crc32.c
similarity index 95%
rename from av1/encoder/arm/crc32/hash_crc32.c
rename to av1/encoder/arm/crc32/hash_arm_crc32.c
index 771496c..91fc1e0 100644
--- a/av1/encoder/arm/crc32/hash_crc32.c
+++ b/av1/encoder/arm/crc32/hash_arm_crc32.c
@@ -9,9 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <stdint.h>
-#include <stddef.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
 #include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
 
 #include "config/aom_config.h"
 
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index ee8b115..a17a41a 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -12,792 +12,597 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "aom_dsp/txfm_common.h"
 #include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
 #include "aom_ports/mem.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
 
-#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1))
+#define TXFM_COS_BIT_MAX 13
 
-static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
-                                       int16x8_t *const out) {
-#if AOM_ARCH_AARCH64
-  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
-  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
-#else
-  int16x4x2_t temp;
-  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
-  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
-  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
-  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
-#endif
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+//                |   |   |  ^ Weights are applied as indices 0, 2, 3, 1
+//                |   |   |    (see more detail below)
+//                |   |   ^ (int32)x4 input/output parameters
+//                |   ^ 32-bit accumulators internally
+//                ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+//        in0  in1
+//      /----------
+// out0 |  w0   w1   ==>  out0 = in0 * w0 + in1 * w1
+// out1 |  w2   w3   ==>  out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+//          in0       in1
+//      /------------------
+// out0 | (lane 0) (lane 2)   ==>  out0 = in0 *   w0   + in1 *  -w0
+// out1 | (lane 3) (lane 1)   ==>  out1 = in0 * (w0-1) + in1 * (1-w0)
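To make the lane-index naming concrete, here is a scalar sketch of what the 0112 variant computes per lane of its 4-wide inputs (derived only from the helper definitions below; the function name is invented and the block is not part of the patch):

    #include <stdint.h>

    // Illustrative scalar equivalent of butterfly_s32_s32_x4_0112_neon:
    // w is the loaded 4-tuple of Q2.13 weights; products use 32-bit
    // accumulation (mirroring the intrinsics) and are rounded by
    // TXFM_COS_BIT_MAX (13).
    static void butterfly_0112_scalar_sketch(const int16_t w[4],
                                             const int32_t in0[4],
                                             const int32_t in1[4],
                                             int32_t out0[4],
                                             int32_t out1[4]) {
      for (int i = 0; i < 4; ++i) {
        out0[i] = (in0[i] * w[0] + in1[i] * w[1] + (1 << 12)) >> 13;  // lanes 0, 1
        out1[i] = (in0[i] * w[1] + in1[i] * w[2] + (1 << 12)) >> 13;  // lanes 1, 2
      }
    }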
 
-  int32x4x2_t a01 =
-      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
-  out[0] = vreinterpretq_s16_s32(a01.val[0]);
-  out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
-  out[2] = vreinterpretq_s16_s32(a01.val[1]);
-  out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
 }
 
-static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
-                                       int16x8_t *const out) {
-#if AOM_ARCH_AARCH64
-  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
-  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
-  const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
-  const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
-#else
-  int16x4x2_t temp;
-  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
-  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
-  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
-  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
-  temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
-  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
-  temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
-  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
-#endif
-
-  const int32x4x2_t b02 =
-      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
-  const int32x4x2_t b13 =
-      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
-
-#if AOM_ARCH_AARCH64
-  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
-                                            vreinterpretq_s64_s32(b13.val[0])));
-  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
-                                            vreinterpretq_s64_s32(b13.val[0])));
-  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
-                                            vreinterpretq_s64_s32(b13.val[1])));
-  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
-                                            vreinterpretq_s64_s32(b13.val[1])));
-#else
-  out[0] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
-  out[2] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
-  out[1] = vreinterpretq_s16_s32(
-      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
-  out[3] = vreinterpretq_s16_s32(
-      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
-#endif
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
 }
 
-static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
-                                       int16x8_t *const out) {
-  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
-  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
-
-  const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
-                                    vreinterpretq_s32_s16(a15.val[0]));
-  const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
-                                    vreinterpretq_s32_s16(a15.val[1]));
-
-  const int32x4_t zeros = vdupq_n_s32(0);
-
-#if AOM_ARCH_AARCH64
-  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
-                                            vreinterpretq_s64_s32(zeros)));
-  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
-                                            vreinterpretq_s64_s32(zeros)));
-#else
-  out[0] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
-  out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
-  out[2] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
-  out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
-  out[4] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
-  out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
-  out[6] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
-  out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
-#endif
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
 }
 
-static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
-                                       int16x8_t *const out) {
-  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
-  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
-  const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
-  const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);
-
-  const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
-                                    vreinterpretq_s32_s16(a15.val[0]));
-  const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
-                                    vreinterpretq_s32_s16(a37.val[0]));
-  const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
-                                    vreinterpretq_s32_s16(a15.val[1]));
-  const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
-                                    vreinterpretq_s32_s16(a37.val[1]));
-
-#if AOM_ARCH_AARCH64
-  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
-                                            vreinterpretq_s64_s32(b15.val[0])));
-  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
-                                            vreinterpretq_s64_s32(b15.val[0])));
-  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
-                                            vreinterpretq_s64_s32(b15.val[1])));
-  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
-                                            vreinterpretq_s64_s32(b15.val[1])));
-  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
-                                            vreinterpretq_s64_s32(b37.val[0])));
-  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
-                                            vreinterpretq_s64_s32(b37.val[0])));
-  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
-                                            vreinterpretq_s64_s32(b37.val[1])));
-  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
-                                            vreinterpretq_s64_s32(b37.val[1])));
-#else
-  out[0] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
-  out[1] = vreinterpretq_s16_s32(
-      vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
-  out[2] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
-  out[3] = vreinterpretq_s16_s32(
-      vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
-  out[4] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
-  out[5] = vreinterpretq_s16_s32(
-      vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
-  out[6] = vreinterpretq_s16_s32(
-      vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
-  out[7] = vreinterpretq_s16_s32(
-      vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
-#endif
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
 }
 
-static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
-                                                      int32x4_t *output,
-                                                      const int size) {
-  int i;
-  for (i = 0; i < size; i++) {
-    output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
-                             NewSqrt2Bits);
-  }
-}
-
-static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
-                                                 int32x4_t *output,
-                                                 const int size) {
-  int i;
-  for (i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
-}
-
-#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                       \
-    out0 = vmulq_n_s32(in0, w0);                             \
-    out0 = vmlaq_n_s32(out0, in1, w1);                       \
-    out0 = vrshlq_s32(out0, v_cos_bit);                      \
-    out1 = vmulq_n_s32(in0, w1);                             \
-    out1 = vmlsq_n_s32(out1, in1, w0);                       \
-    out1 = vrshlq_s32(out1, v_cos_bit);                      \
+#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+                                  out0, out1)                                 \
+  do {                                                                        \
+    int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0);                          \
+    u0 = vmlal_lane_s16(u0, in1, wvec, lane1);                                \
+    int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2);                          \
+    v0 = vmlal_lane_s16(v0, in1, wvec, lane3);                                \
+    *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                              \
+    *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                              \
   } while (0)
 
-#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                             \
-    btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit);          \
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
+    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+    int16x4_t *out0, int16x4_t *out1) {
+  butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
+    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+    int16x4_t *out0, int16x4_t *out1) {
+  butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
+    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+    int16x4_t *out0, int16x4_t *out1) {
+  butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
+    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+    int16x4_t *out0, int16x4_t *out1) {
+  butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+                                  out0, out1)                                 \
+  do {                                                                        \
+    int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0);            \
+    u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1);                  \
+    int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0);           \
+    u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1);                 \
+    int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2);            \
+    v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3);                  \
+    int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2);           \
+    v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3);                 \
+    const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                  \
+    const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX);                  \
+    const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                  \
+    const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX);                  \
+    *out0 = vcombine_s16(c0, c1);                                             \
+    *out1 = vcombine_s16(d0, d1);                                             \
   } while (0)
 
-#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                             \
-    out0 = vmulq_n_s32(in1, w1);                                   \
-    out0 = vmlsq_n_s32(out0, in0, w0);                             \
-    out0 = vrshlq_s32(out0, v_cos_bit);                            \
-    out1 = vmulq_n_s32(in0, w1);                                   \
-    out1 = vmlaq_n_s32(out1, in1, w0);                             \
-    out1 = vrshlq_s32(out1, v_cos_bit);                            \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
+    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+    int16x8_t *out0, int16x8_t *out1) {
+  butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
 
-#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                              \
-    out0 = vmulq_n_s32(in1, w1);                                    \
-    out0 = vmlaq_n_s32(out0, in0, w0);                              \
-    out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit);                  \
-    out1 = vmulq_n_s32(in1, w0);                                    \
-    out1 = vmlsq_n_s32(out1, in0, w1);                              \
-    out1 = vrshlq_s32(out1, v_cos_bit);                             \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
+    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+    int16x8_t *out0, int16x8_t *out1) {
+  butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
 
-static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
+    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+    int16x8_t *out0, int16x8_t *out1) {
+  butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
+    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+    int16x8_t *out0, int16x8_t *out1) {
+  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+                                             int size) {
   for (int i = 0; i < size; ++i) {
     out[size - i - 1] = in[i];
   }
 }
 
-static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
-                                           int32_t *const b) {
-  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+                                             int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
 }
 
-static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
-  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
-  vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
-}
-
-static INLINE void store_output_32bit_w8(int32_t *const out,
-                                         const int32x4_t *const in1,
-                                         const int32x4_t *const in2,
-                                         const int stride, const int out_size) {
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+    const int stride, const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + stride * i, in1[i]);
     vst1q_s32(out + stride * i + 4, in2[i]);
   }
 }
 
-static INLINE void store_rect_16bit_to_32bit_w4(
-    const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
-    const int32x4_t *v_newsqrt2bits) {
-  const int32x4_t b_lo =
-      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
-  vst1q_s32(b, b_lo);
-}
-
-static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
-                                             int32_t *const b,
-                                             const int16x4_t *v_newsqrt2,
-                                             const int32x4_t *v_newsqrt2bits) {
-  const int32x4_t b_lo =
-      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
-  const int32x4_t b_hi =
-      vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
-  vst1q_s32(b, b_lo);
-  vst1q_s32((b + 4), b_hi);
-}
-
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
-                                                 const int stride,
-                                                 int16x8_t *const out,
-                                                 const int out_size) {
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+                                                const int stride,
+                                                int16x4_t *const out,
+                                                const int out_size) {
   for (int i = 0; i < out_size; ++i) {
-    // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
-    // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
-    // the upper lane is unused or further modified after this call. The
-    // latency should be similar between the two.
-    out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
+    out[i] = vld1_s16(in);
     in += stride;
   }
 }
 
-static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *in,
-                                                      const int stride,
-                                                      int16x8_t *const out,
-                                                      const int out_size) {
-  for (int i = out_size - 1; i >= 0; --i) {
-    // vld1q_dup_u64 is used rather than vld1q_lane_u64(lane=0) to avoid
-    // -Wmaybe-uninitialized warnings with some versions of gcc. This assumes
-    // the upper lane is unused or further modified after this call. The
-    // latency should be similar between the two.
-    out[i] = vreinterpretq_s16_u64(vld1q_dup_u64((uint64_t *)in));
-    in += stride;
-  }
-}
-
-static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
-                                              int16x8_t *out, int out_size) {
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+                                                int16x8_t *out, int out_size) {
   for (int i = 0; i < out_size; ++i) {
     out[i] = vld1q_s16(in + i * stride);
   }
 }
 
-static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
-                                                   int stride, int16x8_t *out,
-                                                   int out_size) {
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
   for (int i = 0; i < out_size; ++i) {
-    out[out_size - i - 1] = vld1q_s16(in + i * stride);
+    vst1q_s32(out + i * stride, vmovl_s16(in[i]));
   }
 }
 
-static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
-                                                  int32_t *const out,
-                                                  const int stride,
-                                                  const int out_size) {
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
   for (int i = 0; i < out_size; ++i) {
-    store_16bit_to_32bit_w4(in[i], out + i * stride);
+    vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
+    vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
   }
 }
 
-static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
-                                                  int32_t *const out,
-                                                  const int stride,
-                                                  const int out_size) {
+// A note on naming:
+//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+//                |      |   |     ^ 1 => a single vector
+//                |      |   |       n => an array of vectors
+//                |      |   |   ^ input/output vector element count
+//                |      |   ^ output type
+//                |      ^ input type
+//                ^ multiplicand and shift identifier
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+                      round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+                      round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
+}
+
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                 \
+  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
+    for (int i = 0; i < size; ++i) {                                         \
+      out[i] = fn(in[i]);                                                    \
+    }                                                                        \
+  }
+
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
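+
+// Each ROUND_SHIFT_SQRT_LOOP_HELPER instantiation above simply maps the
+// corresponding *_x1 helper over an array of vectors; e.g. the first one
+// expands to:
+//   static AOM_FORCE_INLINE void round_shift_sqrt2_s32_s32_4xn_neon(
+//       const int32x4_t *in, int32x4_t *out, int size) {
+//     for (int i = 0; i < size; ++i) {
+//       out[i] = round_shift_sqrt2_s32_s32_4x1_neon(in[i]);
+//     }
+//   }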
+
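+// The rect store variants below additionally scale each value by sqrt(2)
+// (multiply by NewSqrt2, rounding shift by NewSqrt2Bits) while widening to
+// 32 bits, as used on the rectangular transform paths.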
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
   for (int i = 0; i < out_size; ++i) {
-    store_16bit_to_32bit(in[i], out + i * stride);
+    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
   }
 }
 
-static INLINE void store_rect_buffer_16bit_to_32bit_w4(
-    const int16x8_t *const in, int32_t *const out, const int stride,
-    const int out_size) {
-  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
-  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
   for (int i = 0; i < out_size; ++i) {
-    store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
-                                 &v_newsqrt2bits);
+    vst1q_s32(out + i * stride + 0,
+              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
+    vst1q_s32(out + i * stride + 4,
+              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
   }
 }
 
-static INLINE void store_rect_buffer_16bit_to_32bit_w8(
-    const int16x8_t *const in, int32_t *const out, const int stride,
-    const int out_size) {
-  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
-  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
-  for (int i = 0; i < out_size; ++i) {
-    store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
-                              &v_newsqrt2bits);
-  }
-}
-
-static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
-  const int16x8_t vbit = vdupq_n_s16(bit);
-  for (int i = 0; i < size; ++i) {
-    in[i] = vrshlq_s16(in[i], vbit);
-  }
-}
-
-static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
-                                            const int16x8_t *v_bit) {
-  for (int i = 0; i < size; ++i) {
-    in[i] = vrshlq_s16(in[i], *v_bit);
-  }
-}
-
-void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
-                       int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *sinpi = sinpi_arr(cos_bit);
-
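+// 4-point forward ADST on int16x4_t vectors (four lanes at a time).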
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
   int32x4_t u[6], v[6];
+  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+  const int16x4_t u01 = vqadd_s16(input[0], input[1]);
 
-  u[0] = vmovl_s16(vget_low_s16(input[0]));
-  u[1] = vmovl_s16(vget_low_s16(input[1]));
-  u[2] = vmovl_s16(vget_low_s16(input[2]));
-  u[3] = vmovl_s16(vget_low_s16(input[3]));
-  u[4] = vaddq_s32(u[0], u[1]);
-  v[5] = vmulq_n_s32(u[2], sinpi[3]);
-  v[0] = vmulq_n_s32(u[1], sinpi[2]);
-  v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
-  v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
-  v[2] = vmulq_n_s32(u[4], sinpi[3]);
-  v[3] = vmulq_n_s32(u[0], sinpi[4]);
-  v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
-  v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);
+  v[5] = vmull_lane_s16(input[2], sinpi, 2);
+  v[0] = vmull_lane_s16(input[1], sinpi, 1);
+  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+  v[2] = vmull_lane_s16(u01, sinpi, 2);
+  v[3] = vmull_lane_s16(input[0], sinpi, 3);
+  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
 
   u[0] = vaddq_s32(v[0], v[1]);
-  u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
+  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
   u[2] = vsubq_s32(v[3], v[4]);
   u[3] = vsubq_s32(u[2], u[0]);
-  u[5] = vmlaq_n_s32(u[3], v[5], 3);
+  u[3] = vmlaq_n_s32(u[3], v[5], 3);
 
-  int32x4_t vshift = vdupq_n_s32(-cos_bit);
-  u[0] = vrshlq_s32(u[0], vshift);
-  u[1] = vrshlq_s32(u[1], vshift);
-  u[2] = vrshlq_s32(u[2], vshift);
-  u[3] = vrshlq_s32(u[5], vshift);
-
-  output[0] = custom_packs_s32(u[0], u[2]);
-
-  output[1] = custom_packs_s32(u[1], u[3]);
-  output[2] = vextq_s16(output[0], output[0], 4);
-  output[3] = vextq_s16(output[1], output[1], 4);
+  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+  output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+  output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
 }
 
-#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
-                       v_cos_bit)                                    \
-  do {                                                               \
-    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                  \
-    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                  \
-    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                         \
-    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                               \
-    int32x4_t v0 = vmulq_n_s32(in0_l, w1_l);                         \
-    v0 = vmlaq_n_s32(v0, in1_l, w1_h);                               \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                        \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                        \
-    const int16x4_t c1 = vqmovn_s32(c0);                             \
-    const int16x4_t d1 = vqmovn_s32(d0);                             \
-    out0 = vcombine_s16(c1, c1);                                     \
-    out1 = vcombine_s16(d1, c1);                                     \
-  } while (0)
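+// 8-point forward ADST on int16x4_t vectors (four lanes at a time).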
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
 
-#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                    \
-    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
-    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
-    int32x4_t u0 = vmulq_n_s32(in1_l, w0_h);                              \
-    u0 = vmlsq_n_s32(u0, in0_l, w0_l);                                    \
-    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
-    v0 = vmlaq_n_s32(v0, in1_l, w0_l);                                    \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
-    const int16x4_t c1 = vqmovn_s32(c0);                                  \
-    const int16x4_t d1 = vqmovn_s32(d0);                                  \
-    out0 = vcombine_s16(c1, c1);                                          \
-    out1 = vcombine_s16(d1, c1);                                          \
-  } while (0)
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
 
-#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                    \
-    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
-    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
-    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
-    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
-    int32x4_t v0 = vmulq_n_s32(in1_l, w0_l);                              \
-    v0 = vmlsq_n_s32(v0, in0_l, w0_h);                                    \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
-    const int16x4_t c1 = vqmovn_s32(c0);                                  \
-    const int16x4_t d1 = vqmovn_s32(d0);                                  \
-    out0 = vcombine_s16(c1, c1);                                          \
-    out1 = vcombine_s16(d1, c1);                                          \
-  } while (0)
-
-#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                    \
-    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
-    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
-    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
-    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
-    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
-    v0 = vmlsq_n_s32(v0, in1_l, w0_l);                                    \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
-    const int16x4_t c1 = vqmovn_s32(c0);                                  \
-    const int16x4_t d1 = vqmovn_s32(d0);                                  \
-    out0 = vcombine_s16(c1, c1);                                          \
-    out1 = vcombine_s16(d1, c1);                                          \
-  } while (0)
-
-static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
-                          int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1-2
-  int16x8_t x2[8];
-  btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
-                       x2[2], x2[3], v_cos_bit);
-  btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
-                       x2[6], x2[7], v_cos_bit);
+  int16x4_t x2[8];
+  butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+  butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
 
   // stage 3
-  int16x8_t x3[8];
-  x3[0] = vqaddq_s16(input[0], x2[2]);
-  x3[2] = vqsubq_s16(input[0], x2[2]);
-  x3[1] = vqsubq_s16(x2[3], input[7]);
-  x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
-  x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
-  x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
-  x3[5] = vqaddq_s16(input[6], x2[7]);
-  x3[7] = vqsubq_s16(input[6], x2[7]);
+  int16x4_t x3[8];
+  x3[0] = vqadd_s16(input[0], x2[2]);
+  x3[1] = vqsub_s16(x2[3], input[7]);
+  x3[2] = vqsub_s16(input[0], x2[2]);
+  x3[3] = vqadd_s16(input[7], x2[3]);
+  x3[4] = vqsub_s16(x2[6], input[1]);
+  x3[5] = vqadd_s16(input[6], x2[7]);
+  x3[6] = vqadd_s16(input[1], x2[6]);
+  x3[7] = vqsub_s16(input[6], x2[7]);
 
   // stage 4
-  int16x8_t x4[8];
-
-  btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
-                       v_cos_bit);
-  btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
-                       v_cos_bit);
+  int16x4_t x4[8];
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);
 
   // stage 5
-  int16x8_t x5[8];
-  x5[0] = vqaddq_s16(x3[0], x4[4]);
-  x5[4] = vqsubq_s16(x3[0], x4[4]);
-  x5[1] = vqaddq_s16(x3[1], x4[5]);
-  x5[5] = vqsubq_s16(x3[1], x4[5]);
-  x5[2] = vqaddq_s16(x3[2], x4[6]);
-  x5[6] = vqsubq_s16(x3[2], x4[6]);
-  x5[3] = vqaddq_s16(x3[3], x4[7]);
-  x5[7] = vqsubq_s16(x3[3], x4[7]);
+  int16x4_t x5[8];
+  x5[0] = vqadd_s16(x3[0], x4[4]);
+  x5[1] = vqadd_s16(x3[1], x4[5]);
+  x5[2] = vqadd_s16(x3[2], x4[6]);
+  x5[3] = vqsub_s16(x4[7], x3[3]);
+  x5[4] = vqsub_s16(x3[0], x4[4]);
+  x5[5] = vqsub_s16(x3[1], x4[5]);
+  x5[6] = vqsub_s16(x3[2], x4[6]);
+  x5[7] = vqadd_s16(x3[3], x4[7]);
 
   // stage 6-7
-  btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
-                       v_cos_bit);
-  btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
-                       v_cos_bit);
-  btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
-                       v_cos_bit);
-  btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
-                       v_cos_bit);
+  butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+  butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+  butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+  butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
 }
 
-static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
-                          int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *sinpi = sinpi_arr(cos_bit);
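+// 4-point forward ADST on int16x8_t vectors (eight lanes at a time).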
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  int32x4_t u_lo[4], u_hi[4];
+  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+  const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
 
-  const int16x8_t in7 = vaddq_s16(input[0], input[1]);
-  int32x4_t u_lo[8], u_hi[8], v_hi[8];
+  u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
+  u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);
 
-  int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
-  int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
-  int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
-  int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
-  int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
-  int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
-  int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
-  int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
-  int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
-  int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));
+  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
+  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);
 
-  u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
-  u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);
+  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
+  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);
 
-  u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
-  u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);
+  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
+  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);
 
-  u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
-  u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);
+  u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
+  u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);
 
-  u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
-  u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);
+  u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
+  u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);
 
-  u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);
+  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
+  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);
 
-  v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
-  u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
-  u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);
+  u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
+  u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);
 
-  u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
-  u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);
+  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
+  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);
 
-  u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
-  u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);
-
-  u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
-  u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);
-
-  u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);
-
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
-  u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);
+  u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
+  u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);
 
   u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
   u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
 
-  u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
-  u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);
+  const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
+  u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
+  u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);
 
-  u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
-  u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
-  u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
-  u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
-  u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
-  u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
-  u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
-  u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);
-
-  output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
-  output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
-  output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
-  output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
+  output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
+                           vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
+  output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
+                           vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
+  output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
+                           vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
+  output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
+                           vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
 }
 
-void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
-                      const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
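+// 4-point forward DCT on int16x4_t vectors (four lanes at a time).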
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+                                          int16x4_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
+
+  int16x4_t in12a = vadd_s16(input[1], input[2]);
+  int16x4_t in12s = vsub_s16(input[1], input[2]);
+  int16x4_t in03a = vadd_s16(input[0], input[3]);
+  int16x4_t in03s = vsub_s16(input[0], input[3]);
+
+  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
 
   int32x4_t u[4];
-
-  int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
-  int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
-  int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
-  int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
-
-  int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
-  int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
   u[0] = vaddq_s32(u0ad1, u0ad2);
   u[1] = vsubq_s32(u0ad2, u0ad1);
-  u[2] = vmulq_n_s32(in12s, cospi[48]);
-  u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);
+  u[2] = vmull_lane_s16(in12s, cospi16, 1);
+  u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
+  u[3] = vmull_lane_s16(in03s, cospi16, 1);
+  u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);
 
-  u[3] = vmulq_n_s32(in03s, cospi[48]);
-  u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);
-
-  u[0] = vrshlq_s32(u[0], v_cos_bit);
-  u[1] = vrshlq_s32(u[1], v_cos_bit);
-  u[2] = vrshlq_s32(u[2], v_cos_bit);
-  u[3] = vrshlq_s32(u[3], v_cos_bit);
-
-  output[0] = custom_packs_s32(u[0], u[1]);
-  output[1] = custom_packs_s32(u[2], u[3]);
-  output[2] = vextq_s16(output[0], output[0], 4);
-  output[3] = vextq_s16(output[1], output[1], 4);
+  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+  output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+  output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
 }
 
-#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
-  do {                                                            \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));             \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));           \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));             \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));           \
-    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                    \
-    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                          \
-    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                   \
-    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                         \
-    int32x4_t v0 = vmulq_n_s32(in_low1, w1_h);                    \
-    v0 = vmlaq_n_s32(v0, in_low0, w1_l);                          \
-    int32x4_t v1 = vmulq_n_s32(in_high1, w1_h);                   \
-    v1 = vmlaq_n_s32(v1, in_high0, w1_l);                         \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                     \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                     \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                     \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                     \
-    out0 = custom_packs_s32(c0, c1);                              \
-    out1 = custom_packs_s32(d0, d1);                              \
-  } while (0)
+// Butterfly pre-processing:
+// e.g. n=4:
+//   out[0] = in[0] + in[3]
+//   out[1] = in[1] + in[2]
+//   out[2] = in[1] - in[2]
+//   out[3] = in[0] - in[3]
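+//
+// i.e. the first n/2 outputs are the symmetric sums in[i] + in[n-1-i] and the
+// last n/2 outputs are the differences in[n/2-1-i] - in[n/2+i], matching the
+// first add/sub stage of the forward DCTs below.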
 
-#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                 \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
-    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
-    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                               \
-    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
-    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                              \
-    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
-    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
-    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
-    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
-    out0 = custom_packs_s32(c0, c1);                                   \
-    out1 = custom_packs_s32(d0, d1);                                   \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
+                                                      int16x4_t *output,
+                                                      int n) {
+  for (int i = 0; i < n / 2; ++i) {
+    output[i] = vqadd_s16(input[i], input[n - i - 1]);
+  }
+  for (int i = 0; i < n / 2; ++i) {
+    output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+  }
+}
 
-#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                 \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
-    int32x4_t u0 = vmulq_n_s32(in_low0, w0_l);                         \
-    u0 = vmlsq_n_s32(u0, in_low1, w0_h);                               \
-    int32x4_t u1 = vmulq_n_s32(in_high0, w0_l);                        \
-    u1 = vmlsq_n_s32(u1, in_high1, w0_h);                              \
-    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
-    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
-    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
-    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
-    out0 = custom_packs_s32(c0, c1);                                   \
-    out1 = custom_packs_s32(d0, d1);                                   \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
+                                                      int16x8_t *output,
+                                                      int n) {
+  for (int i = 0; i < n / 2; ++i) {
+    output[i] = vqaddq_s16(input[i], input[n - i - 1]);
+  }
+  for (int i = 0; i < n / 2; ++i) {
+    output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+  }
+}
 
-#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                  \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                   \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                 \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                   \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                 \
-    int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h);                         \
-    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                                \
-    int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h);                        \
-    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                               \
-    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                          \
-    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                                \
-    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                         \
-    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                               \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                           \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                           \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                           \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                           \
-    out0 = custom_packs_s32(c0, c1);                                    \
-    out1 = custom_packs_s32(d0, d1);                                    \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      int n) {
+  for (int i = 0; i < n / 2; ++i) {
+    output[i] = vqaddq_s32(input[i], input[n - i - 1]);
+  }
+  for (int i = 0; i < n / 2; ++i) {
+    output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+  }
+}
 
-#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                 \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
-    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
-    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
-    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
-    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
-    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
-    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                               \
-    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
-    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                              \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
-    out0 = custom_packs_s32(c0, c1);                                   \
-    out1 = custom_packs_s32(d0, d1);                                   \
-  } while (0)
+// Butterfly post-processing:
+// e.g. n=8:
+//   out[0] = in0[0] + in1[3];
+//   out[1] = in0[1] + in1[2];
+//   out[2] = in0[1] - in1[2];
+//   out[3] = in0[0] - in1[3];
+//   out[4] = in0[7] - in1[4];
+//   out[5] = in0[6] - in1[5];
+//   out[6] = in0[6] + in1[5];
+//   out[7] = in0[7] + in1[4];
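+//
+// i.e. the first and last quarters of the outputs are sums of the
+// corresponding in0/in1 elements and the middle two quarters are differences,
+// as in the n=8 example above.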
 
-#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                                 \
-    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
-    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
-    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
-    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
-    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
-    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
-    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
-    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
-    int32x4_t v0 = vmulq_n_s32(in_low0, w0_h);                         \
-    v0 = vmlsq_n_s32(v0, in_low1, w0_l);                               \
-    int32x4_t v1 = vmulq_n_s32(in_high0, w0_h);                        \
-    v1 = vmlsq_n_s32(v1, in_high1, w0_l);                              \
-    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
-    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
-    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
-    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
-    out0 = custom_packs_s32(c0, c1);                                   \
-    out1 = custom_packs_s32(d0, d1);                                   \
-  } while (0)
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
+                                                       const int16x4_t *in1,
+                                                       int16x4_t *output,
+                                                       int n) {
+  for (int i = 0; i < n / 4; ++i) {
+    output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[(3 * n) / 4 + i] =
+        vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+  }
+}
 
-static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
-                         int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
+                                                       const int16x8_t *in1,
+                                                       int16x8_t *output,
+                                                       int n) {
+  for (int i = 0; i < n / 4; ++i) {
+    output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[(3 * n) / 4 + i] =
+        vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+  }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
+                                                       const int32x4_t *in1,
+                                                       int32x4_t *output,
+                                                       int n) {
+  for (int i = 0; i < n / 4; ++i) {
+    output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[(3 * n) / 4 + i] =
+        vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+  }
+}
+
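+// 4-point forward DCT on int16x8_t vectors (eight lanes at a time).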
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+                                          int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
 
   // stage 1
   int16x8_t x1[4];
-  x1[0] = vqaddq_s16(input[0], input[3]);
-  x1[3] = vqsubq_s16(input[0], input[3]);
-  x1[1] = vqaddq_s16(input[1], input[2]);
-  x1[2] = vqsubq_s16(input[1], input[2]);
+  butterfly_dct_pre_s16_x8(input, x1, 4);
 
   // stage 2
   int16x8_t x2[4];
-  btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);
 
   // stage 3
   output[0] = x2[0];
   output[1] = x2[2];
   output[2] = x2[1];
   output[3] = x2[3];
 }
 
-static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
-                         int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
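+// 8-point forward DCT on int16x4_t vectors (four lanes at a time).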
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+                                          int16x4_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+  // stage 1
+  int16x4_t x1[8];
+  butterfly_dct_pre_s16_x4(input, x1, 8);
+
+  // stage 2
+  int16x4_t x2[8];
+  butterfly_dct_pre_s16_x4(x1, x2, 4);
+  butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+  // stage 3
+  int16x4_t x3[8];
+  butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+  butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);
+
+  // stage 4-5
+  butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+  butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
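+// 8-point forward DCT on int16x8_t vectors (eight lanes at a time).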
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+                                          int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
 
   // stage 1
   int16x8_t x1[8];
-  x1[0] = vqaddq_s16(input[0], input[7]);
-  x1[7] = vqsubq_s16(input[0], input[7]);
-  x1[1] = vqaddq_s16(input[1], input[6]);
-  x1[6] = vqsubq_s16(input[1], input[6]);
-  x1[2] = vqaddq_s16(input[2], input[5]);
-  x1[5] = vqsubq_s16(input[2], input[5]);
-  x1[3] = vqaddq_s16(input[3], input[4]);
-  x1[4] = vqsubq_s16(input[3], input[4]);
+  butterfly_dct_pre_s16_x8(input, x1, 8);
 
   // stage 2
   int16x8_t x2[8];
-  x2[0] = vqaddq_s16(x1[0], x1[3]);
-  x2[3] = vqsubq_s16(x1[0], x1[3]);
-  x2[1] = vqaddq_s16(x1[1], x1[2]);
-  x2[2] = vqsubq_s16(x1[1], x1[2]);
-
-  btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
-                       v_cos_bit);
+  butterfly_dct_pre_s16_x8(x1, x2, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
 
   // stage 3
   int16x8_t x3[8];
-  btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
-                       v_cos_bit);
-
-  btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
-                       v_cos_bit);
-  x3[4] = vqaddq_s16(x1[4], x2[5]);
-  x3[5] = vqsubq_s16(x1[4], x2[5]);
-  x3[6] = vqsubq_s16(x1[7], x2[6]);
-  x3[7] = vqaddq_s16(x1[7], x2[6]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+  butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);
 
   // stage 4-5
-  btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
-                       v_cos_bit);
-  btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
-                       v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
 }
 
-void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
-                  const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
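+// 16-point forward DCT on int16x4_t vectors (four lanes at a time).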
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1
-  int16x8_t x1[8];
-  x1[0] = vqaddq_s16(input[0], input[7]);
-  x1[7] = vqsubq_s16(input[0], input[7]);
-  x1[1] = vqaddq_s16(input[1], input[6]);
-  x1[6] = vqsubq_s16(input[1], input[6]);
-  x1[2] = vqaddq_s16(input[2], input[5]);
-  x1[5] = vqsubq_s16(input[2], input[5]);
-  x1[3] = vqaddq_s16(input[3], input[4]);
-  x1[4] = vqsubq_s16(input[3], input[4]);
+  int16x4_t x1[16];
+  butterfly_dct_pre_s16_x4(input, x1, 16);
 
   // stage 2
-  int16x8_t x2[8];
-  x2[0] = vqaddq_s16(x1[0], x1[3]);
-  x2[3] = vqsubq_s16(x1[0], x1[3]);
-  x2[1] = vqaddq_s16(x1[1], x1[2]);
-  x2[2] = vqsubq_s16(x1[1], x1[2]);
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
-                    v_cos_bit);
+  int16x4_t x2[16];
+  butterfly_dct_pre_s16_x4(x1, x2, 8);
+  butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+  butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
 
   // stage 3
-  int16x8_t x3[8];
-  btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
-                    v_cos_bit);
-  x3[4] = vqaddq_s16(x1[4], x2[5]);
-  x3[5] = vqsubq_s16(x1[4], x2[5]);
-  x3[6] = vqsubq_s16(x1[7], x2[6]);
-  x3[7] = vqaddq_s16(x1[7], x2[6]);
+  int16x4_t x3[16];
+  butterfly_dct_pre_s16_x4(x2, x3, 4);
+  butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+  butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);
 
-  // stage 4-5
-  btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
-                    v_cos_bit);
+  // stage 4
+  int16x4_t x4[16];
+  butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
+                                 &output[12]);
+  butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+  butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+  // stage 5
+  int16x4_t x5[16];
+  butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+  butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
+                                 &output[6]);
+  butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
+  butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);
+
+  // stage 6-7
+  butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
+                                 &output[15]);
+  butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
+                                 &output[7]);
+  butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
+                                 &output[11]);
+  butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
+                                 &output[3]);
 }
 
-static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
-                          int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
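+// 16-point forward DCT on int16x8_t vectors (eight lanes at a time).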
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1
   int16x8_t x1[16];
-  x1[0] = vqaddq_s16(input[0], input[15]);
-  x1[15] = vqsubq_s16(input[0], input[15]);
-  x1[1] = vqaddq_s16(input[1], input[14]);
-  x1[14] = vqsubq_s16(input[1], input[14]);
-  x1[2] = vqaddq_s16(input[2], input[13]);
-  x1[13] = vqsubq_s16(input[2], input[13]);
-  x1[3] = vqaddq_s16(input[3], input[12]);
-  x1[12] = vqsubq_s16(input[3], input[12]);
-  x1[4] = vqaddq_s16(input[4], input[11]);
-  x1[11] = vqsubq_s16(input[4], input[11]);
-  x1[5] = vqaddq_s16(input[5], input[10]);
-  x1[10] = vqsubq_s16(input[5], input[10]);
-  x1[6] = vqaddq_s16(input[6], input[9]);
-  x1[9] = vqsubq_s16(input[6], input[9]);
-  x1[7] = vqaddq_s16(input[7], input[8]);
-  x1[8] = vqsubq_s16(input[7], input[8]);
+  butterfly_dct_pre_s16_x8(input, x1, 16);
 
   // stage 2
   int16x8_t x2[16];
-  x2[0] = vqaddq_s16(x1[0], x1[7]);
-  x2[7] = vqsubq_s16(x1[0], x1[7]);
-  x2[1] = vqaddq_s16(x1[1], x1[6]);
-  x2[6] = vqsubq_s16(x1[1], x1[6]);
-  x2[2] = vqaddq_s16(x1[2], x1[5]);
-  x2[5] = vqsubq_s16(x1[2], x1[5]);
-  x2[3] = vqaddq_s16(x1[3], x1[4]);
-  x2[4] = vqsubq_s16(x1[3], x1[4]);
-
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
-                    v_cos_bit);
+  butterfly_dct_pre_s16_x8(x1, x2, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
 
   // stage 3
   int16x8_t x3[16];
-  x3[0] = vqaddq_s16(x2[0], x2[3]);
-  x3[3] = vqsubq_s16(x2[0], x2[3]);
-  x3[1] = vqaddq_s16(x2[1], x2[2]);
-  x3[2] = vqsubq_s16(x2[1], x2[2]);
-
-  btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
-                    v_cos_bit);
-
-  x3[8] = vqaddq_s16(x1[8], x2[11]);
-  x3[11] = vqsubq_s16(x1[8], x2[11]);
-  x3[9] = vqaddq_s16(x1[9], x2[10]);
-  x3[10] = vqsubq_s16(x1[9], x2[10]);
-  x3[12] = vqsubq_s16(x1[15], x2[12]);
-  x3[15] = vqaddq_s16(x1[15], x2[12]);
-  x3[13] = vqsubq_s16(x1[14], x2[13]);
-  x3[14] = vqaddq_s16(x1[14], x2[13]);
+  butterfly_dct_pre_s16_x8(x2, x3, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+  butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);
 
   // stage 4
   int16x8_t x4[16];
-  btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
-              output[0], output[8]);
-  btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
-              output[4], output[12]);
-  x4[4] = vqaddq_s16(x2[4], x3[5]);
-  x4[5] = vqsubq_s16(x2[4], x3[5]);
-  x4[6] = vqsubq_s16(x2[7], x3[6]);
-  x4[7] = vqaddq_s16(x2[7], x3[6]);
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
-                     v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
+                                 &output[12]);
+  butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
 
   // stage 5
   int16x8_t x5[16];
-
-  btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
-                    v_cos_bit);
-  x5[8] = vqaddq_s16(x3[8], x4[9]);
-  x5[9] = vqsubq_s16(x3[8], x4[9]);
-  x5[10] = vqsubq_s16(x3[11], x4[10]);
-  x5[11] = vqaddq_s16(x3[11], x4[10]);
-  x5[12] = vqaddq_s16(x3[12], x4[13]);
-  x5[13] = vqsubq_s16(x3[12], x4[13]);
-  x5[14] = vqsubq_s16(x3[15], x4[14]);
-  x5[15] = vqaddq_s16(x3[15], x4[14]);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
+                                 &output[6]);
+  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
+  butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);
 
   // stage 6-7
-  btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
+                                 &output[15]);
+  butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
+                                 &output[7]);
+  butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
+                                 &output[11]);
+  butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
+                                 &output[3]);
 }
 
-void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
-                       int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
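+// 32-point forward DCT on int16x8_t vectors (eight lanes at a time).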
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   // stage 1
   int16x8_t x1[32];
-  x1[0] = vqaddq_s16(input[0], input[31]);
-  x1[31] = vqsubq_s16(input[0], input[31]);
-  x1[1] = vqaddq_s16(input[1], input[30]);
-  x1[30] = vqsubq_s16(input[1], input[30]);
-  x1[2] = vqaddq_s16(input[2], input[29]);
-  x1[29] = vqsubq_s16(input[2], input[29]);
-  x1[3] = vqaddq_s16(input[3], input[28]);
-  x1[28] = vqsubq_s16(input[3], input[28]);
-  x1[4] = vqaddq_s16(input[4], input[27]);
-  x1[27] = vqsubq_s16(input[4], input[27]);
-  x1[5] = vqaddq_s16(input[5], input[26]);
-  x1[26] = vqsubq_s16(input[5], input[26]);
-  x1[6] = vqaddq_s16(input[6], input[25]);
-  x1[25] = vqsubq_s16(input[6], input[25]);
-  x1[7] = vqaddq_s16(input[7], input[24]);
-  x1[24] = vqsubq_s16(input[7], input[24]);
-  x1[8] = vqaddq_s16(input[8], input[23]);
-  x1[23] = vqsubq_s16(input[8], input[23]);
-  x1[9] = vqaddq_s16(input[9], input[22]);
-  x1[22] = vqsubq_s16(input[9], input[22]);
-  x1[10] = vqaddq_s16(input[10], input[21]);
-  x1[21] = vqsubq_s16(input[10], input[21]);
-  x1[11] = vqaddq_s16(input[11], input[20]);
-  x1[20] = vqsubq_s16(input[11], input[20]);
-  x1[12] = vqaddq_s16(input[12], input[19]);
-  x1[19] = vqsubq_s16(input[12], input[19]);
-  x1[13] = vqaddq_s16(input[13], input[18]);
-  x1[18] = vqsubq_s16(input[13], input[18]);
-  x1[14] = vqaddq_s16(input[14], input[17]);
-  x1[17] = vqsubq_s16(input[14], input[17]);
-  x1[15] = vqaddq_s16(input[15], input[16]);
-  x1[16] = vqsubq_s16(input[15], input[16]);
+  butterfly_dct_pre_s16_x8(input, x1, 32);
 
   // stage 2
   int16x8_t x2[32];
-  x2[0] = vqaddq_s16(x1[0], x1[15]);
-  x2[15] = vqsubq_s16(x1[0], x1[15]);
-  x2[1] = vqaddq_s16(x1[1], x1[14]);
-  x2[14] = vqsubq_s16(x1[1], x1[14]);
-  x2[2] = vqaddq_s16(x1[2], x1[13]);
-  x2[13] = vqsubq_s16(x1[2], x1[13]);
-  x2[3] = vqaddq_s16(x1[3], x1[12]);
-  x2[12] = vqsubq_s16(x1[3], x1[12]);
-  x2[4] = vqaddq_s16(x1[4], x1[11]);
-  x2[11] = vqsubq_s16(x1[4], x1[11]);
-  x2[5] = vqaddq_s16(x1[5], x1[10]);
-  x2[10] = vqsubq_s16(x1[5], x1[10]);
-  x2[6] = vqaddq_s16(x1[6], x1[9]);
-  x2[9] = vqsubq_s16(x1[6], x1[9]);
-  x2[7] = vqaddq_s16(x1[7], x1[8]);
-  x2[8] = vqsubq_s16(x1[7], x1[8]);
-
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
-                    v_cos_bit);
+  butterfly_dct_pre_s16_x8(x1, x2, 16);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);
 
   // stage 3
   int16x8_t x3[32];
-  x3[0] = vqaddq_s16(x2[0], x2[7]);
-  x3[7] = vqsubq_s16(x2[0], x2[7]);
-  x3[1] = vqaddq_s16(x2[1], x2[6]);
-  x3[6] = vqsubq_s16(x2[1], x2[6]);
-  x3[2] = vqaddq_s16(x2[2], x2[5]);
-  x3[5] = vqsubq_s16(x2[2], x2[5]);
-  x3[3] = vqaddq_s16(x2[3], x2[4]);
-  x3[4] = vqsubq_s16(x2[3], x2[4]);
-
-  btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
-                    v_cos_bit);
-
-  x3[16] = vqaddq_s16(x1[16], x2[23]);
-  x3[23] = vqsubq_s16(x1[16], x2[23]);
-  x3[17] = vqaddq_s16(x1[17], x2[22]);
-  x3[22] = vqsubq_s16(x1[17], x2[22]);
-  x3[18] = vqaddq_s16(x1[18], x2[21]);
-  x3[21] = vqsubq_s16(x1[18], x2[21]);
-  x3[19] = vqaddq_s16(x1[19], x2[20]);
-  x3[20] = vqsubq_s16(x1[19], x2[20]);
-  x3[24] = vqsubq_s16(x1[31], x2[24]);
-  x3[31] = vqaddq_s16(x1[31], x2[24]);
-  x3[25] = vqsubq_s16(x1[30], x2[25]);
-  x3[30] = vqaddq_s16(x1[30], x2[25]);
-  x3[26] = vqsubq_s16(x1[29], x2[26]);
-  x3[29] = vqaddq_s16(x1[29], x2[26]);
-  x3[27] = vqsubq_s16(x1[28], x2[27]);
-  x3[28] = vqaddq_s16(x1[28], x2[27]);
+  butterfly_dct_pre_s16_x8(x2, x3, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
+  butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);
 
   // stage 4
   int16x8_t x4[32];
-  x4[0] = vqaddq_s16(x3[0], x3[3]);
-  x4[3] = vqsubq_s16(x3[0], x3[3]);
-  x4[1] = vqaddq_s16(x3[1], x3[2]);
-  x4[2] = vqsubq_s16(x3[1], x3[2]);
-  btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
-                    v_cos_bit);
-  x4[8] = vqaddq_s16(x2[8], x3[11]);
-  x4[11] = vqsubq_s16(x2[8], x3[11]);
-  x4[9] = vqaddq_s16(x2[9], x3[10]);
-  x4[10] = vqsubq_s16(x2[9], x3[10]);
-  x4[12] = vqsubq_s16(x2[15], x3[12]);
-  x4[15] = vqaddq_s16(x2[15], x3[12]);
-  x4[13] = vqsubq_s16(x2[14], x3[13]);
-  x4[14] = vqaddq_s16(x2[14], x3[13]);
-
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
-                     v_cos_bit);
+  butterfly_dct_pre_s16_x8(x3, x4, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
+  butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);
 
   // stage 5
   int16x8_t x5[32];
-  btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
-                    v_cos_bit);
-  x5[4] = vqaddq_s16(x3[4], x4[5]);
-  x5[5] = vqsubq_s16(x3[4], x4[5]);
-  x5[6] = vqsubq_s16(x3[7], x4[6]);
-  x5[7] = vqaddq_s16(x3[7], x4[6]);
-
-  btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
-                     v_cos_bit);
-
-  x5[16] = vqaddq_s16(x3[16], x4[19]);
-  x5[19] = vqsubq_s16(x3[16], x4[19]);
-  x5[17] = vqaddq_s16(x3[17], x4[18]);
-  x5[18] = vqsubq_s16(x3[17], x4[18]);
-  x5[20] = vqsubq_s16(x3[23], x4[20]);
-  x5[23] = vqaddq_s16(x3[23], x4[20]);
-  x5[21] = vqsubq_s16(x3[22], x4[21]);
-  x5[22] = vqaddq_s16(x3[22], x4[21]);
-  x5[24] = vqaddq_s16(x3[24], x4[27]);
-  x5[27] = vqsubq_s16(x3[24], x4[27]);
-  x5[25] = vqaddq_s16(x3[25], x4[26]);
-  x5[26] = vqsubq_s16(x3[25], x4[26]);
-  x5[28] = vqsubq_s16(x3[31], x4[28]);
-  x5[31] = vqaddq_s16(x3[31], x4[28]);
-  x5[29] = vqsubq_s16(x3[30], x4[29]);
-  x5[30] = vqaddq_s16(x3[30], x4[29]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
+                                 &output[16]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
+                                 &output[24]);
+  butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
+  butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
+  butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);
 
   // stage 6
   int16x8_t x6[32];
-  btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
-                    v_cos_bit);
-  x6[8] = vqaddq_s16(x4[8], x5[9]);
-  x6[9] = vqsubq_s16(x4[8], x5[9]);
-  x6[10] = vqsubq_s16(x4[11], x5[10]);
-  x6[11] = vqaddq_s16(x4[11], x5[10]);
-  x6[12] = vqaddq_s16(x4[12], x5[13]);
-  x6[13] = vqsubq_s16(x4[12], x5[13]);
-  x6[14] = vqsubq_s16(x4[15], x5[14]);
-  x6[15] = vqaddq_s16(x4[15], x5[14]);
-  btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
-                     v_cos_bit);
-  btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
-                     v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
+                                 &output[12]);
+  butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
+  butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
+  butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
+  butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);
 
   // stage 7
   int16x8_t x7[32];
-  btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
-                    output[22], v_cos_bit);
-  btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
-                    v_cos_bit);
-  x7[16] = vqaddq_s16(x5[16], x6[17]);
-  x7[17] = vqsubq_s16(x5[16], x6[17]);
-  x7[18] = vqsubq_s16(x5[19], x6[18]);
-  x7[19] = vqaddq_s16(x5[19], x6[18]);
-  x7[20] = vqaddq_s16(x5[20], x6[21]);
-  x7[21] = vqsubq_s16(x5[20], x6[21]);
-  x7[22] = vqsubq_s16(x5[23], x6[22]);
-  x7[23] = vqaddq_s16(x5[23], x6[22]);
-  x7[24] = vqaddq_s16(x5[24], x6[25]);
-  x7[25] = vqsubq_s16(x5[24], x6[25]);
-  x7[26] = vqsubq_s16(x5[27], x6[26]);
-  x7[27] = vqaddq_s16(x5[27], x6[26]);
-  x7[28] = vqaddq_s16(x5[28], x6[29]);
-  x7[29] = vqsubq_s16(x5[28], x6[29]);
-  x7[30] = vqsubq_s16(x5[31], x6[30]);
-  x7[31] = vqaddq_s16(x5[31], x6[30]);
+  butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
+                                 &output[30]);
+  butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
+                                 &output[14]);
+  butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
+                                 &output[22]);
+  butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
+                                 &output[6]);
+  butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
+  butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
+  butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
+  butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);
 
-  btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
-                    output[15], v_cos_bit);
-  btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
-                    output[11], v_cos_bit);
-  btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
-                    output[19], v_cos_bit);
-  btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
+                                 &output[31]);
+  butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
+                                 &output[15]);
+  butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
+                                 &output[23]);
+  butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
+                                 &output[7]);
+  butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
+                                 &output[27]);
+  butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
+                                 &output[11]);
+  butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
+                                 &output[19]);
+  butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
+                                 &output[3]);
 }
 
-void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
-                                  int16x8_t *x4, const int32_t *cospi32,
-                                  const int32x4_t *v_cos_bit) {
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
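+  // Fuses the former av1_fdct8x64_stage_1234_neon helper into this routine.
+  // Each vld1q_s16 below pulls two 4-lane constant groups from the Q13 table;
+  // vget_low/high_s16 then split them into the int16x4_t coefficients that
+  // the butterfly helpers take.
+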
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+  const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+  // stage 1
   int16x8_t x1[64];
+  butterfly_dct_pre_s16_x8(input, x1, 64);
+
+  // stage 2
   int16x8_t x2[64];
-  x1[0] = vqaddq_s16(input[0], input[63]);
-  x1[63] = vqsubq_s16(input[0], input[63]);
-  x1[1] = vqaddq_s16(input[1], input[62]);
-  x1[62] = vqsubq_s16(input[1], input[62]);
-  x1[2] = vqaddq_s16(input[2], input[61]);
-  x1[61] = vqsubq_s16(input[2], input[61]);
-  x1[3] = vqaddq_s16(input[3], input[60]);
-  x1[60] = vqsubq_s16(input[3], input[60]);
-  x1[4] = vqaddq_s16(input[4], input[59]);
-  x1[59] = vqsubq_s16(input[4], input[59]);
-  x1[5] = vqaddq_s16(input[5], input[58]);
-  x1[58] = vqsubq_s16(input[5], input[58]);
-  x1[6] = vqaddq_s16(input[6], input[57]);
-  x1[57] = vqsubq_s16(input[6], input[57]);
-  x1[7] = vqaddq_s16(input[7], input[56]);
-  x1[56] = vqsubq_s16(input[7], input[56]);
-  x1[8] = vqaddq_s16(input[8], input[55]);
-  x1[55] = vqsubq_s16(input[8], input[55]);
-  x1[9] = vqaddq_s16(input[9], input[54]);
-  x1[54] = vqsubq_s16(input[9], input[54]);
-  x1[10] = vqaddq_s16(input[10], input[53]);
-  x1[53] = vqsubq_s16(input[10], input[53]);
-  x1[11] = vqaddq_s16(input[11], input[52]);
-  x1[52] = vqsubq_s16(input[11], input[52]);
-  x1[12] = vqaddq_s16(input[12], input[51]);
-  x1[51] = vqsubq_s16(input[12], input[51]);
-  x1[13] = vqaddq_s16(input[13], input[50]);
-  x1[50] = vqsubq_s16(input[13], input[50]);
-  x1[14] = vqaddq_s16(input[14], input[49]);
-  x1[49] = vqsubq_s16(input[14], input[49]);
-  x1[15] = vqaddq_s16(input[15], input[48]);
-  x1[48] = vqsubq_s16(input[15], input[48]);
-  x1[16] = vqaddq_s16(input[16], input[47]);
-  x1[47] = vqsubq_s16(input[16], input[47]);
-  x1[17] = vqaddq_s16(input[17], input[46]);
-  x1[46] = vqsubq_s16(input[17], input[46]);
-  x1[18] = vqaddq_s16(input[18], input[45]);
-  x1[45] = vqsubq_s16(input[18], input[45]);
-  x1[19] = vqaddq_s16(input[19], input[44]);
-  x1[44] = vqsubq_s16(input[19], input[44]);
-  x1[20] = vqaddq_s16(input[20], input[43]);
-  x1[43] = vqsubq_s16(input[20], input[43]);
-  x1[21] = vqaddq_s16(input[21], input[42]);
-  x1[42] = vqsubq_s16(input[21], input[42]);
-  x1[22] = vqaddq_s16(input[22], input[41]);
-  x1[41] = vqsubq_s16(input[22], input[41]);
-  x1[23] = vqaddq_s16(input[23], input[40]);
-  x1[40] = vqsubq_s16(input[23], input[40]);
-  x1[24] = vqaddq_s16(input[24], input[39]);
-  x1[39] = vqsubq_s16(input[24], input[39]);
-  x1[25] = vqaddq_s16(input[25], input[38]);
-  x1[38] = vqsubq_s16(input[25], input[38]);
-  x1[26] = vqaddq_s16(input[26], input[37]);
-  x1[37] = vqsubq_s16(input[26], input[37]);
-  x1[27] = vqaddq_s16(input[27], input[36]);
-  x1[36] = vqsubq_s16(input[27], input[36]);
-  x1[28] = vqaddq_s16(input[28], input[35]);
-  x1[35] = vqsubq_s16(input[28], input[35]);
-  x1[29] = vqaddq_s16(input[29], input[34]);
-  x1[34] = vqsubq_s16(input[29], input[34]);
-  x1[30] = vqaddq_s16(input[30], input[33]);
-  x1[33] = vqsubq_s16(input[30], input[33]);
-  x1[31] = vqaddq_s16(input[31], input[32]);
-  x1[32] = vqsubq_s16(input[31], input[32]);
-
-  x2[0] = vqaddq_s16(x1[0], x1[31]);
-  x2[31] = vqsubq_s16(x1[0], x1[31]);
-  x2[1] = vqaddq_s16(x1[1], x1[30]);
-  x2[30] = vqsubq_s16(x1[1], x1[30]);
-  x2[2] = vqaddq_s16(x1[2], x1[29]);
-  x2[29] = vqsubq_s16(x1[2], x1[29]);
-  x2[3] = vqaddq_s16(x1[3], x1[28]);
-  x2[28] = vqsubq_s16(x1[3], x1[28]);
-  x2[4] = vqaddq_s16(x1[4], x1[27]);
-  x2[27] = vqsubq_s16(x1[4], x1[27]);
-  x2[5] = vqaddq_s16(x1[5], x1[26]);
-  x2[26] = vqsubq_s16(x1[5], x1[26]);
-  x2[6] = vqaddq_s16(x1[6], x1[25]);
-  x2[25] = vqsubq_s16(x1[6], x1[25]);
-  x2[7] = vqaddq_s16(x1[7], x1[24]);
-  x2[24] = vqsubq_s16(x1[7], x1[24]);
-  x2[8] = vqaddq_s16(x1[8], x1[23]);
-  x2[23] = vqsubq_s16(x1[8], x1[23]);
-  x2[9] = vqaddq_s16(x1[9], x1[22]);
-  x2[22] = vqsubq_s16(x1[9], x1[22]);
-  x2[10] = vqaddq_s16(x1[10], x1[21]);
-  x2[21] = vqsubq_s16(x1[10], x1[21]);
-  x2[11] = vqaddq_s16(x1[11], x1[20]);
-  x2[20] = vqsubq_s16(x1[11], x1[20]);
-  x2[12] = vqaddq_s16(x1[12], x1[19]);
-  x2[19] = vqsubq_s16(x1[12], x1[19]);
-  x2[13] = vqaddq_s16(x1[13], x1[18]);
-  x2[18] = vqsubq_s16(x1[13], x1[18]);
-  x2[14] = vqaddq_s16(x1[14], x1[17]);
-  x2[17] = vqsubq_s16(x1[14], x1[17]);
-  x2[15] = vqaddq_s16(x1[15], x1[16]);
-  x2[16] = vqsubq_s16(x1[15], x1[16]);
-
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
-                    *v_cos_bit);
+  butterfly_dct_pre_s16_x8(x1, x2, 32);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
 
   // stage 3
-  x3[0] = vqaddq_s16(x2[0], x2[15]);
-  x3[15] = vqsubq_s16(x2[0], x2[15]);
-  x3[1] = vqaddq_s16(x2[1], x2[14]);
-  x3[14] = vqsubq_s16(x2[1], x2[14]);
-  x3[2] = vqaddq_s16(x2[2], x2[13]);
-  x3[13] = vqsubq_s16(x2[2], x2[13]);
-  x3[3] = vqaddq_s16(x2[3], x2[12]);
-  x3[12] = vqsubq_s16(x2[3], x2[12]);
-  x3[4] = vqaddq_s16(x2[4], x2[11]);
-  x3[11] = vqsubq_s16(x2[4], x2[11]);
-  x3[5] = vqaddq_s16(x2[5], x2[10]);
-  x3[10] = vqsubq_s16(x2[5], x2[10]);
-  x3[6] = vqaddq_s16(x2[6], x2[9]);
-  x3[9] = vqsubq_s16(x2[6], x2[9]);
-  x3[7] = vqaddq_s16(x2[7], x2[8]);
-  x3[8] = vqsubq_s16(x2[7], x2[8]);
+  int16x8_t x3[64];
+  butterfly_dct_pre_s16_x8(x2, x3, 16);
   x3[16] = x2[16];
   x3[17] = x2[17];
   x3[18] = x2[18];
   x3[19] = x2[19];
-  btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
-                    *v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
   x3[28] = x2[28];
   x3[29] = x2[29];
   x3[30] = x2[30];
   x3[31] = x2[31];
-  x3[32] = vqaddq_s16(x1[32], x2[47]);
-  x3[47] = vqsubq_s16(x1[32], x2[47]);
-  x3[33] = vqaddq_s16(x1[33], x2[46]);
-  x3[46] = vqsubq_s16(x1[33], x2[46]);
-  x3[34] = vqaddq_s16(x1[34], x2[45]);
-  x3[45] = vqsubq_s16(x1[34], x2[45]);
-  x3[35] = vqaddq_s16(x1[35], x2[44]);
-  x3[44] = vqsubq_s16(x1[35], x2[44]);
-  x3[36] = vqaddq_s16(x1[36], x2[43]);
-  x3[43] = vqsubq_s16(x1[36], x2[43]);
-  x3[37] = vqaddq_s16(x1[37], x2[42]);
-  x3[42] = vqsubq_s16(x1[37], x2[42]);
-  x3[38] = vqaddq_s16(x1[38], x2[41]);
-  x3[41] = vqsubq_s16(x1[38], x2[41]);
-  x3[39] = vqaddq_s16(x1[39], x2[40]);
-  x3[40] = vqsubq_s16(x1[39], x2[40]);
-  x3[48] = vqsubq_s16(x1[63], x2[48]);
-  x3[63] = vqaddq_s16(x1[63], x2[48]);
-  x3[49] = vqsubq_s16(x1[62], x2[49]);
-  x3[62] = vqaddq_s16(x1[62], x2[49]);
-  x3[50] = vqsubq_s16(x1[61], x2[50]);
-  x3[61] = vqaddq_s16(x1[61], x2[50]);
-  x3[51] = vqsubq_s16(x1[60], x2[51]);
-  x3[60] = vqaddq_s16(x1[60], x2[51]);
-  x3[52] = vqsubq_s16(x1[59], x2[52]);
-  x3[59] = vqaddq_s16(x1[59], x2[52]);
-  x3[53] = vqsubq_s16(x1[58], x2[53]);
-  x3[58] = vqaddq_s16(x1[58], x2[53]);
-  x3[54] = vqsubq_s16(x1[57], x2[54]);
-  x3[57] = vqaddq_s16(x1[57], x2[54]);
-  x3[55] = vqsubq_s16(x1[56], x2[55]);
-  x3[56] = vqaddq_s16(x1[56], x2[55]);
+  butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
 
   // stage 4
-  x4[0] = vqaddq_s16(x3[0], x3[7]);
-  x4[7] = vqsubq_s16(x3[0], x3[7]);
-  x4[1] = vqaddq_s16(x3[1], x3[6]);
-  x4[6] = vqsubq_s16(x3[1], x3[6]);
-  x4[2] = vqaddq_s16(x3[2], x3[5]);
-  x4[5] = vqsubq_s16(x3[2], x3[5]);
-  x4[3] = vqaddq_s16(x3[3], x3[4]);
-  x4[4] = vqsubq_s16(x3[3], x3[4]);
-
-  btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
-                    *v_cos_bit);
-  btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
-                    *v_cos_bit);
-
-  x4[16] = vqaddq_s16(x3[16], x3[23]);
-  x4[23] = vqsubq_s16(x3[16], x3[23]);
-  x4[17] = vqaddq_s16(x3[17], x3[22]);
-  x4[22] = vqsubq_s16(x3[17], x3[22]);
-  x4[18] = vqaddq_s16(x3[18], x3[21]);
-  x4[21] = vqsubq_s16(x3[18], x3[21]);
-  x4[19] = vqaddq_s16(x3[19], x3[20]);
-  x4[20] = vqsubq_s16(x3[19], x3[20]);
-  x4[24] = vqsubq_s16(x3[31], x3[24]);
-  x4[31] = vqaddq_s16(x3[31], x3[24]);
-  x4[25] = vqsubq_s16(x3[30], x3[25]);
-  x4[30] = vqaddq_s16(x3[30], x3[25]);
-  x4[26] = vqsubq_s16(x3[29], x3[26]);
-  x4[29] = vqaddq_s16(x3[29], x3[26]);
-  x4[27] = vqsubq_s16(x3[28], x3[27]);
-  x4[28] = vqaddq_s16(x3[28], x3[27]);
-}
-
-void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
-                       int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
-  int16x8_t x3[64];
   int16x8_t x4[64];
-
-  av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);
-
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
-                     v_cos_bit);
+  butterfly_dct_pre_s16_x8(x3, x4, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
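+  // in0 == in1 is intentional here: stage 4 recombines x3[16..31] with
+  // itself, matching the removed per-element x4[16..31] assignments.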
+  butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
 
   // stage 5
   int16x8_t x5[64];
-  x5[0] = vqaddq_s16(x4[0], x4[3]);
-  x5[3] = vqsubq_s16(x4[0], x4[3]);
-  x5[1] = vqaddq_s16(x4[1], x4[2]);
-  x5[2] = vqsubq_s16(x4[1], x4[2]);
-
-  btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
-                    v_cos_bit);
-
-  x5[8] = vqaddq_s16(x3[8], x4[11]);
-  x5[11] = vqsubq_s16(x3[8], x4[11]);
-  x5[9] = vqaddq_s16(x3[9], x4[10]);
-  x5[10] = vqsubq_s16(x3[9], x4[10]);
-  x5[12] = vqsubq_s16(x3[15], x4[12]);
-  x5[15] = vqaddq_s16(x3[15], x4[12]);
-  x5[13] = vqsubq_s16(x3[14], x4[13]);
-  x5[14] = vqaddq_s16(x3[14], x4[13]);
-
-  btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
-                     v_cos_bit);
-
-  x5[32] = vqaddq_s16(x3[32], x4[39]);
-  x5[39] = vqsubq_s16(x3[32], x4[39]);
-  x5[33] = vqaddq_s16(x3[33], x4[38]);
-  x5[38] = vqsubq_s16(x3[33], x4[38]);
-  x5[34] = vqaddq_s16(x3[34], x4[37]);
-  x5[37] = vqsubq_s16(x3[34], x4[37]);
-  x5[35] = vqaddq_s16(x3[35], x4[36]);
-  x5[36] = vqsubq_s16(x3[35], x4[36]);
-  x5[40] = vqsubq_s16(x3[47], x4[40]);
-  x5[47] = vqaddq_s16(x3[47], x4[40]);
-  x5[41] = vqsubq_s16(x3[46], x4[41]);
-  x5[46] = vqaddq_s16(x3[46], x4[41]);
-  x5[42] = vqsubq_s16(x3[45], x4[42]);
-  x5[45] = vqaddq_s16(x3[45], x4[42]);
-  x5[43] = vqsubq_s16(x3[44], x4[43]);
-  x5[44] = vqaddq_s16(x3[44], x4[43]);
-  x5[48] = vqaddq_s16(x3[48], x4[55]);
-  x5[55] = vqsubq_s16(x3[48], x4[55]);
-  x5[49] = vqaddq_s16(x3[49], x4[54]);
-  x5[54] = vqsubq_s16(x3[49], x4[54]);
-  x5[50] = vqaddq_s16(x3[50], x4[53]);
-  x5[53] = vqsubq_s16(x3[50], x4[53]);
-  x5[51] = vqaddq_s16(x3[51], x4[52]);
-  x5[52] = vqsubq_s16(x3[51], x4[52]);
-  x5[56] = vqsubq_s16(x3[63], x4[56]);
-  x5[63] = vqaddq_s16(x3[63], x4[56]);
-  x5[57] = vqsubq_s16(x3[62], x4[57]);
-  x5[62] = vqaddq_s16(x3[62], x4[57]);
-  x5[58] = vqsubq_s16(x3[61], x4[58]);
-  x5[61] = vqaddq_s16(x3[61], x4[58]);
-  x5[59] = vqsubq_s16(x3[60], x4[59]);
-  x5[60] = vqaddq_s16(x3[60], x4[59]);
+  butterfly_dct_pre_s16_x8(x4, x5, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+  butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+  butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
+  butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
 
   // stage 6
   int16x8_t x6[64];
-  btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
-                    v_cos_bit);
-  x6[4] = vqaddq_s16(x4[4], x5[5]);
-  x6[5] = vqsubq_s16(x4[4], x5[5]);
-  x6[6] = vqsubq_s16(x4[7], x5[6]);
-  x6[7] = vqaddq_s16(x4[7], x5[6]);
-
-  btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
-                     v_cos_bit);
-
-  x6[16] = vqaddq_s16(x4[16], x5[19]);
-  x6[19] = vqsubq_s16(x4[16], x5[19]);
-  x6[17] = vqaddq_s16(x4[17], x5[18]);
-  x6[18] = vqsubq_s16(x4[17], x5[18]);
-  x6[20] = vqsubq_s16(x4[23], x5[20]);
-  x6[23] = vqaddq_s16(x4[23], x5[20]);
-  x6[21] = vqsubq_s16(x4[22], x5[21]);
-  x6[22] = vqaddq_s16(x4[22], x5[21]);
-  x6[24] = vqaddq_s16(x4[24], x5[27]);
-  x6[27] = vqsubq_s16(x4[24], x5[27]);
-  x6[25] = vqaddq_s16(x4[25], x5[26]);
-  x6[26] = vqsubq_s16(x4[25], x5[26]);
-  x6[28] = vqsubq_s16(x4[31], x5[28]);
-  x6[31] = vqaddq_s16(x4[31], x5[28]);
-  x6[29] = vqsubq_s16(x4[30], x5[29]);
-  x6[30] = vqaddq_s16(x4[30], x5[29]);
-
-  btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
-                     v_cos_bit);
-  btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
-                     v_cos_bit);
-  btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
-                     v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+  butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+  butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+  butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
+  butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+  butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+  butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+  butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+  butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
 
   // stage 7
   int16x8_t x7[64];
-
-  btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
-  btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
-                    v_cos_bit);
-  x7[8] = vqaddq_s16(x5[8], x6[9]);
-  x7[9] = vqsubq_s16(x5[8], x6[9]);
-  x7[10] = vqsubq_s16(x5[11], x6[10]);
-  x7[11] = vqaddq_s16(x5[11], x6[10]);
-  x7[12] = vqaddq_s16(x5[12], x6[13]);
-  x7[13] = vqsubq_s16(x5[12], x6[13]);
-  x7[14] = vqsubq_s16(x5[15], x6[14]);
-  x7[15] = vqaddq_s16(x5[15], x6[14]);
-
-  btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
-                     v_cos_bit);
-
-  btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
-                     v_cos_bit);
-
-  x7[32] = vqaddq_s16(x5[32], x6[35]);
-  x7[35] = vqsubq_s16(x5[32], x6[35]);
-  x7[33] = vqaddq_s16(x5[33], x6[34]);
-  x7[34] = vqsubq_s16(x5[33], x6[34]);
-  x7[36] = vqsubq_s16(x5[39], x6[36]);
-  x7[39] = vqaddq_s16(x5[39], x6[36]);
-  x7[37] = vqsubq_s16(x5[38], x6[37]);
-  x7[38] = vqaddq_s16(x5[38], x6[37]);
-  x7[40] = vqaddq_s16(x5[40], x6[43]);
-  x7[43] = vqsubq_s16(x5[40], x6[43]);
-  x7[41] = vqaddq_s16(x5[41], x6[42]);
-  x7[42] = vqsubq_s16(x5[41], x6[42]);
-  x7[44] = vqsubq_s16(x5[47], x6[44]);
-  x7[47] = vqaddq_s16(x5[47], x6[44]);
-  x7[45] = vqsubq_s16(x5[46], x6[45]);
-  x7[46] = vqaddq_s16(x5[46], x6[45]);
-  x7[48] = vqaddq_s16(x5[48], x6[51]);
-  x7[51] = vqsubq_s16(x5[48], x6[51]);
-  x7[49] = vqaddq_s16(x5[49], x6[50]);
-  x7[50] = vqsubq_s16(x5[49], x6[50]);
-  x7[52] = vqsubq_s16(x5[55], x6[52]);
-  x7[55] = vqaddq_s16(x5[55], x6[52]);
-  x7[53] = vqsubq_s16(x5[54], x6[53]);
-  x7[54] = vqaddq_s16(x5[54], x6[53]);
-  x7[56] = vqaddq_s16(x5[56], x6[59]);
-  x7[59] = vqsubq_s16(x5[56], x6[59]);
-  x7[57] = vqaddq_s16(x5[57], x6[58]);
-  x7[58] = vqsubq_s16(x5[57], x6[58]);
-  x7[60] = vqsubq_s16(x5[63], x6[60]);
-  x7[63] = vqaddq_s16(x5[63], x6[60]);
-  x7[61] = vqsubq_s16(x5[62], x6[61]);
-  x7[62] = vqaddq_s16(x5[62], x6[61]);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+  butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
+  butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+  butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+  butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+  butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
+  butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
+  butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
+  butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
 
   // stage 8
   int16x8_t x8[64];
-
-  btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
-                    v_cos_bit);
-  x8[16] = vqaddq_s16(x6[16], x7[17]);
-  x8[17] = vqsubq_s16(x6[16], x7[17]);
-  x8[18] = vqsubq_s16(x6[19], x7[18]);
-  x8[19] = vqaddq_s16(x6[19], x7[18]);
-  x8[20] = vqaddq_s16(x6[20], x7[21]);
-  x8[21] = vqsubq_s16(x6[20], x7[21]);
-  x8[22] = vqsubq_s16(x6[23], x7[22]);
-  x8[23] = vqaddq_s16(x6[23], x7[22]);
-  x8[24] = vqaddq_s16(x6[24], x7[25]);
-  x8[25] = vqsubq_s16(x6[24], x7[25]);
-  x8[26] = vqsubq_s16(x6[27], x7[26]);
-  x8[27] = vqaddq_s16(x6[27], x7[26]);
-  x8[28] = vqaddq_s16(x6[28], x7[29]);
-  x8[29] = vqsubq_s16(x6[28], x7[29]);
-  x8[30] = vqsubq_s16(x6[31], x7[30]);
-  x8[31] = vqaddq_s16(x6[31], x7[30]);
-
-  btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
-                     v_cos_bit);
-  btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
-                     v_cos_bit);
-  btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
-                     v_cos_bit);
-  btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
-                    v_cos_bit);
-  btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
-                     v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+  butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+  butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+  butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+  butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
+  butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
+  butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
+  butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
+  butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+  butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+  butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+  butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+  butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+  butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+  butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+  butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
 
   // stage 9
   int16x8_t x9[64];
-
-  btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
-                    v_cos_bit);
-  btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
-                    v_cos_bit);
-  x9[32] = vqaddq_s16(x7[32], x8[33]);
-  x9[33] = vqsubq_s16(x7[32], x8[33]);
-  x9[34] = vqsubq_s16(x7[35], x8[34]);
-  x9[35] = vqaddq_s16(x7[35], x8[34]);
-  x9[36] = vqaddq_s16(x7[36], x8[37]);
-  x9[37] = vqsubq_s16(x7[36], x8[37]);
-  x9[38] = vqsubq_s16(x7[39], x8[38]);
-  x9[39] = vqaddq_s16(x7[39], x8[38]);
-  x9[40] = vqaddq_s16(x7[40], x8[41]);
-  x9[41] = vqsubq_s16(x7[40], x8[41]);
-  x9[42] = vqsubq_s16(x7[43], x8[42]);
-  x9[43] = vqaddq_s16(x7[43], x8[42]);
-  x9[44] = vqaddq_s16(x7[44], x8[45]);
-  x9[45] = vqsubq_s16(x7[44], x8[45]);
-  x9[46] = vqsubq_s16(x7[47], x8[46]);
-  x9[47] = vqaddq_s16(x7[47], x8[46]);
-  x9[48] = vqaddq_s16(x7[48], x8[49]);
-  x9[49] = vqsubq_s16(x7[48], x8[49]);
-  x9[50] = vqsubq_s16(x7[51], x8[50]);
-  x9[51] = vqaddq_s16(x7[51], x8[50]);
-  x9[52] = vqaddq_s16(x7[52], x8[53]);
-  x9[53] = vqsubq_s16(x7[52], x8[53]);
-  x9[54] = vqsubq_s16(x7[55], x8[54]);
-  x9[55] = vqaddq_s16(x7[55], x8[54]);
-  x9[56] = vqaddq_s16(x7[56], x8[57]);
-  x9[57] = vqsubq_s16(x7[56], x8[57]);
-  x9[58] = vqsubq_s16(x7[59], x8[58]);
-  x9[59] = vqaddq_s16(x7[59], x8[58]);
-  x9[60] = vqaddq_s16(x7[60], x8[61]);
-  x9[61] = vqsubq_s16(x7[60], x8[61]);
-  x9[62] = vqsubq_s16(x7[63], x8[62]);
-  x9[63] = vqaddq_s16(x7[63], x8[62]);
+  butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+  butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+  butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+  butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+  butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+  butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+  butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+  butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+  butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
+  butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
+  butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
+  butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
+  butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
+  butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
+  butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
+  butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
 
   // stage 10
-  btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
-                    v_cos_bit);
-
-  btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
-                    output[31], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
-                    output[47], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
-                    output[15], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
-                    v_cos_bit);
-
-  btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
-                    output[23], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
-                    output[39], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
-                    v_cos_bit);
-
-  btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
-                    v_cos_bit);
-
-  btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
-                    output[27], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
-                    output[43], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
-                    output[11], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
-                    output[51], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
-                    output[19], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
-                    output[35], v_cos_bit);
-
-  btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+                                 &output[63]);
+  butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+                                 &output[31]);
+  butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+                                 &output[47]);
+  butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+                                 &output[15]);
+  butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+                                 &output[55]);
+  butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+                                 &output[23]);
+  butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+                                 &output[39]);
+  butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+                                 &output[7]);
+  butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+                                 &output[59]);
+  butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+                                 &output[27]);
+  butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+                                 &output[43]);
+  butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+                                 &output[11]);
+  butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+                                 &output[51]);
+  butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+                                 &output[19]);
+  butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+                                 &output[35]);
+  butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+                                 &output[3]);
 
   // stage 11
   output[0] = x6[0];
@@ -1823,1377 +1168,1297 @@
   output[62] = x9[31];
 }
 
-void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
-                    const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
 
-  // stage 1
-  int16x8_t x1[4];
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
 
-  x1[0] = vqnegq_s16(input[7]);
-  x1[1] = vqnegq_s16(input[3]);
-  x1[2] = vqnegq_s16(input[1]);
-  x1[3] = vqnegq_s16(input[5]);
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 2
   int16x8_t x2[8];
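+  // The old stage-1 vqnegq_s16 negations (of input[7], input[3], input[1],
+  // input[5]) are folded into the butterfly variant choice and the swapped
+  // vqaddq/vqsubq operands in stages 2-3 below.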
+  butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
 
-  btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
-                    v_cos_bit);
   // stage 3
   int16x8_t x3[8];
   x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[1] = vqsubq_s16(x2[3], input[7]);
   x3[2] = vqsubq_s16(input[0], x2[2]);
-  x3[1] = vqaddq_s16(x1[0], x2[3]);
-  x3[3] = vqsubq_s16(x1[0], x2[3]);
-  x3[4] = vqaddq_s16(x1[2], x2[6]);
-  x3[6] = vqsubq_s16(x1[2], x2[6]);
+  x3[3] = vqaddq_s16(input[7], x2[3]);
+  x3[4] = vqsubq_s16(x2[6], input[1]);
   x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[6] = vqaddq_s16(input[1], x2[6]);
   x3[7] = vqsubq_s16(input[6], x2[7]);
 
   // stage 4
-  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
 
   // stage 5
   int16x8_t x5[8];
   x5[0] = vqaddq_s16(x3[0], x3[4]);
-  x5[4] = vqsubq_s16(x3[0], x3[4]);
   x5[1] = vqaddq_s16(x3[1], x3[5]);
-  x5[5] = vqsubq_s16(x3[1], x3[5]);
   x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[3] = vqsubq_s16(x3[7], x3[3]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
   x5[6] = vqsubq_s16(x3[2], x3[6]);
-  x5[3] = vqaddq_s16(x3[3], x3[7]);
-  x5[7] = vqsubq_s16(x3[3], x3[7]);
+  x5[7] = vqaddq_s16(x3[3], x3[7]);
 
   // stage 6
-  btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+  butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+  butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+  butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
 }
 
-static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
-                           int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+                                            int16x4_t *output, int cos_bit) {
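+  // 4-lane (int16x4_t) counterpart of fadst8x16_neon below: the stage
+  // structure is identical, with the _x4 butterfly helpers and
+  // vqadd_s16/vqsub_s16 in place of the 8-lane ops.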
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
 
-  // stage 1
-  int16x8_t x1[12];
-  x1[0] = vqnegq_s16(input[15]);
-  x1[1] = vqnegq_s16(input[3]);
-  x1[2] = vqnegq_s16(input[1]);
-  x1[3] = vqnegq_s16(input[13]);
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   // stage 2
-  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
-              x1[4], x1[5]);
-  btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
-                    v_cos_bit);
-  btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
-                    v_cos_bit);
-  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
-              input[10], x1[10], x1[11]);
+  int16x4_t x2[8];
+  butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+  butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+  butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+  butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
   // stage 3
-  int16x8_t x3[16];
-  x3[0] = vqaddq_s16(input[0], x1[4]);
-  x3[2] = vqsubq_s16(input[0], x1[4]);
-  x3[1] = vqaddq_s16(x1[0], x1[5]);
-  x3[3] = vqsubq_s16(x1[0], x1[5]);
-  x3[4] = vqaddq_s16(x1[1], x1[6]);
-  x3[6] = vqsubq_s16(x1[1], x1[6]);
-  x3[5] = vqaddq_s16(input[12], x1[7]);
-  x3[7] = vqsubq_s16(input[12], x1[7]);
-  x3[8] = vqaddq_s16(x1[2], x1[8]);
-  x3[10] = vqsubq_s16(x1[2], x1[8]);
-  x3[9] = vqaddq_s16(input[14], x1[9]);
-  x3[11] = vqsubq_s16(input[14], x1[9]);
-  x3[12] = vqaddq_s16(input[2], x1[10]);
-  x3[14] = vqsubq_s16(input[2], x1[10]);
-  x3[13] = vqaddq_s16(x1[3], x1[11]);
-  x3[15] = vqsubq_s16(x1[3], x1[11]);
+  int16x4_t x3[16];
+  x3[0] = vqadd_s16(input[0], x2[0]);
+  x3[1] = vqsub_s16(x2[1], input[15]);
+  x3[2] = vqsub_s16(input[0], x2[0]);
+  x3[3] = vqadd_s16(input[15], x2[1]);
+  x3[4] = vqsub_s16(x2[2], input[3]);
+  x3[5] = vqadd_s16(input[12], x2[3]);
+  x3[6] = vqadd_s16(input[3], x2[2]);
+  x3[7] = vqsub_s16(input[12], x2[3]);
+  x3[8] = vqsub_s16(x2[4], input[1]);
+  x3[9] = vqadd_s16(input[14], x2[5]);
+  x3[10] = vqadd_s16(input[1], x2[4]);
+  x3[11] = vqsub_s16(input[14], x2[5]);
+  x3[12] = vqadd_s16(input[2], x2[6]);
+  x3[13] = vqsub_s16(x2[7], input[13]);
+  x3[14] = vqsub_s16(input[2], x2[6]);
+  x3[15] = vqadd_s16(input[13], x2[7]);
 
   // stage 4
-  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
-                    v_cos_bit);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+  butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+  butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+  // stage 5
+  int16x4_t x5[16];
+  x5[0] = vqadd_s16(x3[0], x3[4]);
+  x5[1] = vqadd_s16(x3[1], x3[5]);
+  x5[2] = vqadd_s16(x3[2], x3[6]);
+  x5[3] = vqsub_s16(x3[7], x3[3]);
+  x5[4] = vqsub_s16(x3[0], x3[4]);
+  x5[5] = vqsub_s16(x3[1], x3[5]);
+  x5[6] = vqsub_s16(x3[2], x3[6]);
+  x5[7] = vqadd_s16(x3[3], x3[7]);
+  x5[8] = vqadd_s16(x3[8], x3[12]);
+  x5[9] = vqadd_s16(x3[9], x3[13]);
+  x5[10] = vqsub_s16(x3[14], x3[10]);
+  x5[11] = vqadd_s16(x3[11], x3[15]);
+  x5[12] = vqsub_s16(x3[8], x3[12]);
+  x5[13] = vqsub_s16(x3[9], x3[13]);
+  x5[14] = vqadd_s16(x3[10], x3[14]);
+  x5[15] = vqsub_s16(x3[11], x3[15]);
+
+  // stage 6
+  butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+  butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+  butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+  butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+  // stage 7
+  int16x4_t x7[16];
+  x7[0] = vqadd_s16(x5[0], x5[8]);
+  x7[1] = vqadd_s16(x5[1], x5[9]);
+  x7[2] = vqadd_s16(x5[2], x5[10]);
+  x7[3] = vqadd_s16(x5[3], x5[11]);
+  x7[4] = vqadd_s16(x5[4], x5[12]);
+  x7[5] = vqadd_s16(x5[5], x5[13]);
+  x7[6] = vqadd_s16(x5[6], x5[14]);
+  x7[7] = vqsub_s16(x5[15], x5[7]);
+  x7[8] = vqsub_s16(x5[0], x5[8]);
+  x7[9] = vqsub_s16(x5[1], x5[9]);
+  x7[10] = vqsub_s16(x5[2], x5[10]);
+  x7[11] = vqsub_s16(x5[3], x5[11]);
+  x7[12] = vqsub_s16(x5[4], x5[12]);
+  x7[13] = vqsub_s16(x5[5], x5[13]);
+  x7[14] = vqsub_s16(x5[6], x5[14]);
+  x7[15] = vqadd_s16(x5[7], x5[15]);
+
+  // stage 8
+  butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+  butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
+                                 &output[2]);
+  butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
+                                 &output[4]);
+  butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+  butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+  butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
+                                 &output[10]);
+  butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
+                                 &output[12]);
+  butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
+                                 &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+                                            int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+  // stage 2
+  int16x8_t x2[8];
+  butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+  butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+  // stage 3
+  int16x8_t x3[16];
+  x3[0] = vqaddq_s16(input[0], x2[0]);
+  x3[1] = vqsubq_s16(x2[1], input[15]);
+  x3[2] = vqsubq_s16(input[0], x2[0]);
+  x3[3] = vqaddq_s16(input[15], x2[1]);
+  x3[4] = vqsubq_s16(x2[2], input[3]);
+  x3[5] = vqaddq_s16(input[12], x2[3]);
+  x3[6] = vqaddq_s16(input[3], x2[2]);
+  x3[7] = vqsubq_s16(input[12], x2[3]);
+  x3[8] = vqsubq_s16(x2[4], input[1]);
+  x3[9] = vqaddq_s16(input[14], x2[5]);
+  x3[10] = vqaddq_s16(input[1], x2[4]);
+  x3[11] = vqsubq_s16(input[14], x2[5]);
+  x3[12] = vqaddq_s16(input[2], x2[6]);
+  x3[13] = vqsubq_s16(x2[7], input[13]);
+  x3[14] = vqsubq_s16(input[2], x2[6]);
+  x3[15] = vqaddq_s16(input[13], x2[7]);
+
+  // stage 4
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+  butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
 
   // stage 5
   int16x8_t x5[16];
   x5[0] = vqaddq_s16(x3[0], x3[4]);
-  x5[4] = vqsubq_s16(x3[0], x3[4]);
   x5[1] = vqaddq_s16(x3[1], x3[5]);
-  x5[5] = vqsubq_s16(x3[1], x3[5]);
   x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[3] = vqsubq_s16(x3[7], x3[3]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
   x5[6] = vqsubq_s16(x3[2], x3[6]);
-  x5[3] = vqaddq_s16(x3[3], x3[7]);
-  x5[7] = vqsubq_s16(x3[3], x3[7]);
+  x5[7] = vqaddq_s16(x3[3], x3[7]);
   x5[8] = vqaddq_s16(x3[8], x3[12]);
-  x5[12] = vqsubq_s16(x3[8], x3[12]);
   x5[9] = vqaddq_s16(x3[9], x3[13]);
-  x5[13] = vqsubq_s16(x3[9], x3[13]);
-  x5[10] = vqaddq_s16(x3[10], x3[14]);
-  x5[14] = vqsubq_s16(x3[10], x3[14]);
+  x5[10] = vqsubq_s16(x3[14], x3[10]);
   x5[11] = vqaddq_s16(x3[11], x3[15]);
+  x5[12] = vqsubq_s16(x3[8], x3[12]);
+  x5[13] = vqsubq_s16(x3[9], x3[13]);
+  x5[14] = vqaddq_s16(x3[10], x3[14]);
   x5[15] = vqsubq_s16(x3[11], x3[15]);
 
   // stage 6
-  btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
-  btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
-                    v_cos_bit);
-  btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+  butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+  butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
 
   // stage 7
   int16x8_t x7[16];
   x7[0] = vqaddq_s16(x5[0], x5[8]);
-  x7[8] = vqsubq_s16(x5[0], x5[8]);
   x7[1] = vqaddq_s16(x5[1], x5[9]);
-  x7[9] = vqsubq_s16(x5[1], x5[9]);
   x7[2] = vqaddq_s16(x5[2], x5[10]);
-  x7[10] = vqsubq_s16(x5[2], x5[10]);
   x7[3] = vqaddq_s16(x5[3], x5[11]);
-  x7[11] = vqsubq_s16(x5[3], x5[11]);
   x7[4] = vqaddq_s16(x5[4], x5[12]);
-  x7[12] = vqsubq_s16(x5[4], x5[12]);
   x7[5] = vqaddq_s16(x5[5], x5[13]);
-  x7[13] = vqsubq_s16(x5[5], x5[13]);
   x7[6] = vqaddq_s16(x5[6], x5[14]);
+  x7[7] = vqsubq_s16(x5[15], x5[7]);
+  x7[8] = vqsubq_s16(x5[0], x5[8]);
+  x7[9] = vqsubq_s16(x5[1], x5[9]);
+  x7[10] = vqsubq_s16(x5[2], x5[10]);
+  x7[11] = vqsubq_s16(x5[3], x5[11]);
+  x7[12] = vqsubq_s16(x5[4], x5[12]);
+  x7[13] = vqsubq_s16(x5[5], x5[13]);
   x7[14] = vqsubq_s16(x5[6], x5[14]);
-  x7[7] = vqaddq_s16(x5[7], x5[15]);
-  x7[15] = vqsubq_s16(x5[7], x5[15]);
+  x7[15] = vqaddq_s16(x5[7], x5[15]);
 
   // stage 8
-  btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
-                    v_cos_bit);
-  btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
-                    v_cos_bit);
+  butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+  butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
+                                 &output[2]);
+  butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
+                                 &output[4]);
+  butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+  butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+  butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
+                                 &output[10]);
+  butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
+                                 &output[12]);
+  butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
+                                 &output[1]);
 }
 
-void av1_fidentity4x4_neon(const int16x8_t *const input,
-                           int16x8_t *const output, const int8_t cos_bit,
-                           const int8_t *stage_range) {
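+// The identity (IDTX) kernels below only rescale their input: by sqrt(2) for
+// 4-point, 2 for 8-point, 2*sqrt(2) for 16-point and 4 for 32-point, which is
+// why they reduce to the round_shift_* and shift_left_* helpers.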
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+                                               int16x4_t *const output,
+                                               const int cos_bit) {
   (void)cos_bit;
-  (void)stage_range;
-  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
-  for (int i = 0; i < 4; ++i) {
-    const int16x4_t b = vqrshrn_n_s32(
-        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
-    output[i] = vcombine_s16(b, b);
+  round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+                                               int16x8_t *const output,
+                                               const int cos_bit) {
+  (void)cos_bit;
+  round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+                                               int16x4_t *output, int cos_bit) {
+  (void)cos_bit;
+  shift_left_1_s16_x4(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+                                               int16x8_t *output, int cos_bit) {
+  (void)cos_bit;
+  shift_left_1_s16_x8(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+                                                int16x4_t *output,
+                                                int cos_bit) {
+  (void)cos_bit;
+  round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
+  (void)cos_bit;
+  round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
+  (void)cos_bit;
+  shift_left_2_s16_x8(input, output, 32);
+}
+
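+// TRANSFORM_COL generates the *_col_neon wrappers: load the 16-bit input
+// block, apply the stage-0 up-shift by 2 and run the named 1-D transform.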
+#define TRANSFORM_COL(name, tw, n)                                          \
+  static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+                              int stride, int cos_bit) {                    \
+    int16x##tw##_t buf0[n];                                                 \
+    load_buffer_s16_x##tw(input, stride, buf0, n);                          \
+    shift_left_2_s16_x##tw(buf0, buf0, n);                                  \
+    name##_neon(buf0, output, cos_bit);                                     \
   }
-}
 
-static INLINE void fidentity8x4_neon(const int16x8_t *const input,
-                                     int16x8_t *const output,
-                                     const int8_t cos_bit,
-                                     const int8_t *stage_range) {
-  (void)stage_range;
-  (void)cos_bit;
-  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
-  for (int i = 0; i < 4; ++i) {
-    const int16x4_t b_lo = vqrshrn_n_s32(
-        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
-    const int16x4_t b_hi = vqrshrn_n_s32(
-        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
-    output[i] = vcombine_s16(b_lo, b_hi);
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
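+// TRANSFORM_ROW generates the *_row_neon wrappers: run the named 1-D
+// transform and store the result to the 32-bit coefficient buffer.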
+#define TRANSFORM_ROW(name, tw, n)                                          \
+  static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+                              int stride, int cos_bit) {                    \
+    int16x##tw##_t buf0[n];                                                 \
+    name##_neon(input, buf0, cos_bit);                                      \
+    store_buffer_s16_x##tw(buf0, output, stride, n);                        \
   }
-}
 
-void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
-                       int8_t cos_bit, const int8_t *stage_range) {
-  (void)cos_bit;
-  (void)stage_range;
-  int16x8_t one = vdupq_n_s16(1);
-  output[0] = vqrshlq_s16(input[0], one);
-  output[1] = vqrshlq_s16(input[1], one);
-  output[2] = vqrshlq_s16(input[2], one);
-  output[3] = vqrshlq_s16(input[3], one);
-  output[4] = vqrshlq_s16(input[4], one);
-  output[5] = vqrshlq_s16(input[5], one);
-  output[6] = vqrshlq_s16(input[6], one);
-  output[7] = vqrshlq_s16(input[7], one);
-}
-
-static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
-                                      int8_t cos_bit,
-                                      const int8_t *stage_range) {
-  (void)stage_range;
-  (void)cos_bit;
-  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
-  for (int i = 0; i < 16; ++i) {
-    const int16x4_t b_lo = vqrshrn_n_s32(
-        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
-    const int16x4_t b_hi = vqrshrn_n_s32(
-        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
-    output[i] = vcombine_s16(b_lo, b_hi);
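+// TRANSFORM_ROW_RECT generates the *_row_rect_neon wrappers: as above, but
+// store through the rectangular-block helper, which applies the additional
+// scaling used for rectangular transform sizes.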
+#define TRANSFORM_ROW_RECT(name, tw, n)                                        \
+  static void name##_row_rect_neon(const int16x##tw##_t *input,                \
+                                   int32_t *output, int stride, int cos_bit) { \
+    int16x##tw##_t buf0[n];                                                    \
+    name##_neon(input, buf0, cos_bit);                                         \
+    store_rect_buffer_s16_x##tw(buf0, output, stride, n);                      \
   }
-}
 
-static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
-                                      int8_t cos_bit,
-                                      const int8_t *stage_range) {
-  (void)stage_range;
-  (void)cos_bit;
-  for (int i = 0; i < 32; ++i) {
-    output[i] = vshlq_n_s16(input[i], 2);
-  }
-}
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
 
-typedef void (*transform_1d_lbd_neon)(const int16x8_t *input, int16x8_t *output,
-                                      int8_t cos_bit,
-                                      const int8_t *stage_range);
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
 
-static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
-  av1_fdct4x4_neon,       // DCT_DCT
-  av1_fadst4x4_neon,      // ADST_DCT
-  av1_fdct4x4_neon,       // DCT_ADST
-  av1_fadst4x4_neon,      // ADST_ADST
-  av1_fadst4x4_neon,      // FLIPADST_DCT
-  av1_fdct4x4_neon,       // DCT_FLIPADST
-  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
-  av1_fadst4x4_neon,      // ADST_FLIPADST
-  av1_fadst4x4_neon,      // FLIPADST_ADST
-  av1_fidentity4x4_neon,  // IDTX
-  av1_fdct4x4_neon,       // V_DCT
-  av1_fidentity4x4_neon,  // H_DCT
-  av1_fadst4x4_neon,      // V_ADST
-  av1_fidentity4x4_neon,  // H_ADST
-  av1_fadst4x4_neon,      // V_FLIPADST
-  av1_fidentity4x4_neon   // H_FLIPADST
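+// Function pointer types for the 1-D kernels and their column/row wrappers,
+// split by vector width (4 or 8 lanes).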
+typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
+                                        int16x4_t *output, int cos_bit);
+typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
+                                        int16x8_t *output, int cos_bit);
+
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+                                            int16x4_t *output, int stride,
+                                            int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+                                            int16x8_t *output, int stride,
+                                            int cos_bit);
+
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+                                            int32_t *output, int stride,
+                                            int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+                                            int32_t *output, int stride,
+                                            int cos_bit);
+
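+// Per-TX_TYPE dispatch tables mapping each 2-D transform type to the 1-D
+// column and row kernels it is built from.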
+static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_col_neon,       // DCT_DCT
+  fadst4x4_col_neon,      // ADST_DCT
+  fdct4x4_col_neon,       // DCT_ADST
+  fadst4x4_col_neon,      // ADST_ADST
+  fadst4x4_col_neon,      // FLIPADST_DCT
+  fdct4x4_col_neon,       // DCT_FLIPADST
+  fadst4x4_col_neon,      // FLIPADST_FLIPADST
+  fadst4x4_col_neon,      // ADST_FLIPADST
+  fadst4x4_col_neon,      // FLIPADST_ADST
+  fidentity4x4_col_neon,  // IDTX
+  fdct4x4_col_neon,       // V_DCT
+  fidentity4x4_col_neon,  // H_DCT
+  fadst4x4_col_neon,      // V_ADST
+  fidentity4x4_col_neon,  // H_ADST
+  fadst4x4_col_neon,      // V_FLIPADST
+  fidentity4x4_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
-  av1_fdct4x4_neon,       // DCT_DCT
-  av1_fdct4x4_neon,       // ADST_DCT
-  av1_fadst4x4_neon,      // DCT_ADST
-  av1_fadst4x4_neon,      // ADST_ADST
-  av1_fdct4x4_neon,       // FLIPADST_DCT
-  av1_fadst4x4_neon,      // DCT_FLIPADST
-  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
-  av1_fadst4x4_neon,      // ADST_FLIPADST
-  av1_fadst4x4_neon,      // FLIPADST_ADST
-  av1_fidentity4x4_neon,  // IDTX
-  av1_fidentity4x4_neon,  // V_DCT
-  av1_fdct4x4_neon,       // H_DCT
-  av1_fidentity4x4_neon,  // V_ADST
-  av1_fadst4x4_neon,      // H_ADST
-  av1_fidentity4x4_neon,  // V_FLIPADST
-  av1_fadst4x4_neon       // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_row_neon,       // DCT_DCT
+  fdct4x4_row_neon,       // ADST_DCT
+  fadst4x4_row_neon,      // DCT_ADST
+  fadst4x4_row_neon,      // ADST_ADST
+  fdct4x4_row_neon,       // FLIPADST_DCT
+  fadst4x4_row_neon,      // DCT_FLIPADST
+  fadst4x4_row_neon,      // FLIPADST_FLIPADST
+  fadst4x4_row_neon,      // ADST_FLIPADST
+  fadst4x4_row_neon,      // FLIPADST_ADST
+  fidentity4x4_row_neon,  // IDTX
+  fidentity4x4_row_neon,  // V_DCT
+  fdct4x4_row_neon,       // H_DCT
+  fidentity4x4_row_neon,  // V_ADST
+  fadst4x4_row_neon,      // H_ADST
+  fidentity4x4_row_neon,  // V_FLIPADST
+  fadst4x4_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,       // DCT_DCT
-  fadst4x8_neon,      // ADST_DCT
-  fdct4x8_neon,       // DCT_ADST
-  fadst4x8_neon,      // ADST_ADST
-  fadst4x8_neon,      // FLIPADST_DCT
-  fdct4x8_neon,       // DCT_FLIPADST
-  fadst4x8_neon,      // FLIPADST_FLIPADST
-  fadst4x8_neon,      // ADST_FLIPADST
-  fadst4x8_neon,      // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fdct4x8_neon,       // V_DCT
-  fidentity8x8_neon,  // H_DCT
-  fadst4x8_neon,      // V_ADST
-  fidentity8x8_neon,  // H_ADST
-  fadst4x8_neon,      // V_FLIPADST
-  fidentity8x8_neon   // H_FLIPADST
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_col_neon,       // DCT_DCT
+  fadst4x8_col_neon,      // ADST_DCT
+  fdct4x8_col_neon,       // DCT_ADST
+  fadst4x8_col_neon,      // ADST_ADST
+  fadst4x8_col_neon,      // FLIPADST_DCT
+  fdct4x8_col_neon,       // DCT_FLIPADST
+  fadst4x8_col_neon,      // FLIPADST_FLIPADST
+  fadst4x8_col_neon,      // ADST_FLIPADST
+  fadst4x8_col_neon,      // FLIPADST_ADST
+  fidentity4x8_col_neon,  // IDTX
+  fdct4x8_col_neon,       // V_DCT
+  fidentity4x8_col_neon,  // H_DCT
+  fadst4x8_col_neon,      // V_ADST
+  fidentity4x8_col_neon,  // H_ADST
+  fadst4x8_col_neon,      // V_FLIPADST
+  fidentity4x8_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
-  fdct8x4_neon,       // DCT_DCT
-  fdct8x4_neon,       // ADST_DCT
-  fadst8x4_neon,      // DCT_ADST
-  fadst8x4_neon,      // ADST_ADST
-  fdct8x4_neon,       // FLIPADST_DCT
-  fadst8x4_neon,      // DCT_FLIPADST
-  fadst8x4_neon,      // FLIPADST_FLIPADST
-  fadst8x4_neon,      // ADST_FLIPADST
-  fadst8x4_neon,      // FLIPADST_ADST
-  fidentity8x4_neon,  // IDTX
-  fidentity8x4_neon,  // V_DCT
-  fdct8x4_neon,       // H_DCT
-  fidentity8x4_neon,  // V_ADST
-  fadst8x4_neon,      // H_ADST
-  fidentity8x4_neon,  // V_FLIPADST
-  fadst8x4_neon       // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_row_neon,       // DCT_DCT
+  fdct8x4_row_neon,       // ADST_DCT
+  fadst8x4_row_neon,      // DCT_ADST
+  fadst8x4_row_neon,      // ADST_ADST
+  fdct8x4_row_neon,       // FLIPADST_DCT
+  fadst8x4_row_neon,      // DCT_FLIPADST
+  fadst8x4_row_neon,      // FLIPADST_FLIPADST
+  fadst8x4_row_neon,      // ADST_FLIPADST
+  fadst8x4_row_neon,      // FLIPADST_ADST
+  fidentity8x4_row_neon,  // IDTX
+  fidentity8x4_row_neon,  // V_DCT
+  fdct8x4_row_neon,       // H_DCT
+  fidentity8x4_row_neon,  // V_ADST
+  fadst8x4_row_neon,      // H_ADST
+  fidentity8x4_row_neon,  // V_FLIPADST
+  fadst8x4_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
-  fdct8x4_neon,       // DCT_DCT
-  fadst8x4_neon,      // ADST_DCT
-  fdct8x4_neon,       // DCT_ADST
-  fadst8x4_neon,      // ADST_ADST
-  fadst8x4_neon,      // FLIPADST_DCT
-  fdct8x4_neon,       // DCT_FLIPADST
-  fadst8x4_neon,      // FLIPADST_FLIPADST
-  fadst8x4_neon,      // ADST_FLIPADST
-  fadst8x4_neon,      // FLIPADST_ADST
-  fidentity8x4_neon,  // IDTX
-  fdct8x4_neon,       // V_DCT
-  fidentity8x4_neon,  // H_DCT
-  fadst8x4_neon,      // V_ADST
-  fidentity8x4_neon,  // H_ADST
-  fadst8x4_neon,      // V_FLIPADST
-  fidentity8x4_neon   // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_row_rect_neon,       // DCT_DCT
+  fdct8x4_row_rect_neon,       // ADST_DCT
+  fadst8x4_row_rect_neon,      // DCT_ADST
+  fadst8x4_row_rect_neon,      // ADST_ADST
+  fdct8x4_row_rect_neon,       // FLIPADST_DCT
+  fadst8x4_row_rect_neon,      // DCT_FLIPADST
+  fadst8x4_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x4_row_rect_neon,      // ADST_FLIPADST
+  fadst8x4_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x4_row_rect_neon,  // IDTX
+  fidentity8x4_row_rect_neon,  // V_DCT
+  fdct8x4_row_rect_neon,       // H_DCT
+  fidentity8x4_row_rect_neon,  // V_ADST
+  fadst8x4_row_rect_neon,      // H_ADST
+  fidentity8x4_row_rect_neon,  // V_FLIPADST
+  fadst8x4_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,       // DCT_DCT
-  fdct4x8_neon,       // ADST_DCT
-  fadst4x8_neon,      // DCT_ADST
-  fadst4x8_neon,      // ADST_ADST
-  fdct4x8_neon,       // FLIPADST_DCT
-  fadst4x8_neon,      // DCT_FLIPADST
-  fadst4x8_neon,      // FLIPADST_FLIPADST
-  fadst4x8_neon,      // ADST_FLIPADST
-  fadst4x8_neon,      // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fidentity8x8_neon,  // V_DCT
-  fdct4x8_neon,       // H_DCT
-  fidentity8x8_neon,  // V_ADST
-  fadst4x8_neon,      // H_ADST
-  fidentity8x8_neon,  // V_FLIPADST
-  fadst4x8_neon       // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_col_neon,       // DCT_DCT
+  fadst8x4_col_neon,      // ADST_DCT
+  fdct8x4_col_neon,       // DCT_ADST
+  fadst8x4_col_neon,      // ADST_ADST
+  fadst8x4_col_neon,      // FLIPADST_DCT
+  fdct8x4_col_neon,       // DCT_FLIPADST
+  fadst8x4_col_neon,      // FLIPADST_FLIPADST
+  fadst8x4_col_neon,      // ADST_FLIPADST
+  fadst8x4_col_neon,      // FLIPADST_ADST
+  fidentity8x4_col_neon,  // IDTX
+  fdct8x4_col_neon,       // V_DCT
+  fidentity8x4_col_neon,  // H_DCT
+  fadst8x4_col_neon,      // V_ADST
+  fidentity8x4_col_neon,  // H_ADST
+  fadst8x4_col_neon,      // V_FLIPADST
+  fidentity8x4_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,       // DCT_DCT
-  fadst_8x8_neon,     // ADST_DCT
-  fdct8x8_neon,       // DCT_ADST
-  fadst_8x8_neon,     // ADST_ADST
-  fadst_8x8_neon,     // FLIPADST_DCT
-  fdct8x8_neon,       // DCT_FLIPADST
-  fadst_8x8_neon,     // FLIPADST_FLIPADST
-  fadst_8x8_neon,     // ADST_FLIPADST
-  fadst_8x8_neon,     // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fdct8x8_neon,       // V_DCT
-  fidentity8x8_neon,  // H_DCT
-  fadst_8x8_neon,     // V_ADST
-  fidentity8x8_neon,  // H_ADST
-  fadst_8x8_neon,     // V_FLIPADST
-  fidentity8x8_neon,  // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_row_rect_neon,       // DCT_DCT
+  fdct4x8_row_rect_neon,       // ADST_DCT
+  fadst4x8_row_rect_neon,      // DCT_ADST
+  fadst4x8_row_rect_neon,      // ADST_ADST
+  fdct4x8_row_rect_neon,       // FLIPADST_DCT
+  fadst4x8_row_rect_neon,      // DCT_FLIPADST
+  fadst4x8_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst4x8_row_rect_neon,      // ADST_FLIPADST
+  fadst4x8_row_rect_neon,      // FLIPADST_ADST
+  fidentity4x8_row_rect_neon,  // IDTX
+  fidentity4x8_row_rect_neon,  // V_DCT
+  fdct4x8_row_rect_neon,       // H_DCT
+  fidentity4x8_row_rect_neon,  // V_ADST
+  fadst4x8_row_rect_neon,      // H_ADST
+  fidentity4x8_row_rect_neon,  // V_FLIPADST
+  fadst4x8_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,       // DCT_DCT
-  fdct8x8_neon,       // ADST_DCT
-  fadst_8x8_neon,     // DCT_ADST
-  fadst_8x8_neon,     // ADST_ADST
-  fdct8x8_neon,       // FLIPADST_DCT
-  fadst_8x8_neon,     // DCT_FLIPADST
-  fadst_8x8_neon,     // FLIPADST_FLIPADST
-  fadst_8x8_neon,     // ADST_FLIPADST
-  fadst_8x8_neon,     // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fidentity8x8_neon,  // V_DCT
-  fdct8x8_neon,       // H_DCT
-  fidentity8x8_neon,  // V_ADST
-  fadst_8x8_neon,     // H_ADST
-  fidentity8x8_neon,  // V_FLIPADST
-  fadst_8x8_neon      // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_col_neon,       // DCT_DCT
+  fadst8x8_col_neon,      // ADST_DCT
+  fdct8x8_col_neon,       // DCT_ADST
+  fadst8x8_col_neon,      // ADST_ADST
+  fadst8x8_col_neon,      // FLIPADST_DCT
+  fdct8x8_col_neon,       // DCT_FLIPADST
+  fadst8x8_col_neon,      // FLIPADST_FLIPADST
+  fadst8x8_col_neon,      // ADST_FLIPADST
+  fadst8x8_col_neon,      // FLIPADST_ADST
+  fidentity8x8_col_neon,  // IDTX
+  fdct8x8_col_neon,       // V_DCT
+  fidentity8x8_col_neon,  // H_DCT
+  fadst8x8_col_neon,      // V_ADST
+  fidentity8x8_col_neon,  // H_ADST
+  fadst8x8_col_neon,      // V_FLIPADST
+  fidentity8x8_col_neon,  // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
-  fdct8x16_neon,       // DCT_DCT
-  fadst8x16_neon,      // ADST_DCT
-  fdct8x16_neon,       // DCT_ADST
-  fadst8x16_neon,      // ADST_ADST
-  fadst8x16_neon,      // FLIPADST_DCT
-  fdct8x16_neon,       // DCT_FLIPADST
-  fadst8x16_neon,      // FLIPADST_FLIPADST
-  fadst8x16_neon,      // ADST_FLIPADST
-  fadst8x16_neon,      // FLIPADST_ADST
-  fidentity8x16_neon,  // IDTX
-  fdct8x16_neon,       // V_DCT
-  fidentity8x16_neon,  // H_DCT
-  fadst8x16_neon,      // V_ADST
-  fidentity8x16_neon,  // H_ADST
-  fadst8x16_neon,      // V_FLIPADST
-  fidentity8x16_neon   // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_row_neon,       // DCT_DCT
+  fdct8x8_row_neon,       // ADST_DCT
+  fadst8x8_row_neon,      // DCT_ADST
+  fadst8x8_row_neon,      // ADST_ADST
+  fdct8x8_row_neon,       // FLIPADST_DCT
+  fadst8x8_row_neon,      // DCT_FLIPADST
+  fadst8x8_row_neon,      // FLIPADST_FLIPADST
+  fadst8x8_row_neon,      // ADST_FLIPADST
+  fadst8x8_row_neon,      // FLIPADST_ADST
+  fidentity8x8_row_neon,  // IDTX
+  fidentity8x8_row_neon,  // V_DCT
+  fdct8x8_row_neon,       // H_DCT
+  fidentity8x8_row_neon,  // V_ADST
+  fadst8x8_row_neon,      // H_ADST
+  fidentity8x8_row_neon,  // V_FLIPADST
+  fadst8x8_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
-  fdct8x16_neon,       // DCT_DCT
-  fdct8x16_neon,       // ADST_DCT
-  fadst8x16_neon,      // DCT_ADST
-  fadst8x16_neon,      // ADST_ADST
-  fdct8x16_neon,       // FLIPADST_DCT
-  fadst8x16_neon,      // DCT_FLIPADST
-  fadst8x16_neon,      // FLIPADST_FLIPADST
-  fadst8x16_neon,      // ADST_FLIPADST
-  fadst8x16_neon,      // FLIPADST_ADST
-  fidentity8x16_neon,  // IDTX
-  fidentity8x16_neon,  // V_DCT
-  fdct8x16_neon,       // H_DCT
-  fidentity8x16_neon,  // V_ADST
-  fadst8x16_neon,      // H_ADST
-  fidentity8x16_neon,  // V_FLIPADST
-  fadst8x16_neon       // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_row_rect_neon,       // DCT_DCT
+  fdct8x8_row_rect_neon,       // ADST_DCT
+  fadst8x8_row_rect_neon,      // DCT_ADST
+  fadst8x8_row_rect_neon,      // ADST_ADST
+  fdct8x8_row_rect_neon,       // FLIPADST_DCT
+  fadst8x8_row_rect_neon,      // DCT_FLIPADST
+  fadst8x8_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x8_row_rect_neon,      // ADST_FLIPADST
+  fadst8x8_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x8_row_rect_neon,  // IDTX
+  fidentity8x8_row_rect_neon,  // V_DCT
+  fdct8x8_row_rect_neon,       // H_DCT
+  fidentity8x8_row_rect_neon,  // V_ADST
+  fadst8x8_row_rect_neon,      // H_ADST
+  fidentity8x8_row_rect_neon,  // V_FLIPADST
+  fadst8x8_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct8x32_neon,   // DCT_DCT
-  NULL,                // ADST_DCT
-  NULL,                // DCT_ADST
-  NULL,                // ADST_ADST
-  NULL,                // FLIPADST_DCT
-  NULL,                // DCT_FLIPADST
-  NULL,                // FLIPADST_FLIPADST
-  NULL,                // ADST_FLIPADST
-  NULL,                // FLIPADST_ADST
-  fidentity8x32_neon,  // IDTX
-  fidentity8x32_neon,  // V_DCT
-  av1_fdct8x32_neon,   // H_DCT
-  NULL,                // V_ADST
-  NULL,                // H_ADST
-  NULL,                // V_FLIPADST
-  NULL                 // H_FLIPADST
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+  fdct4x16_col_neon,       // DCT_DCT
+  fadst4x16_col_neon,      // ADST_DCT
+  fdct4x16_col_neon,       // DCT_ADST
+  fadst4x16_col_neon,      // ADST_ADST
+  fadst4x16_col_neon,      // FLIPADST_DCT
+  fdct4x16_col_neon,       // DCT_FLIPADST
+  fadst4x16_col_neon,      // FLIPADST_FLIPADST
+  fadst4x16_col_neon,      // ADST_FLIPADST
+  fadst4x16_col_neon,      // FLIPADST_ADST
+  fidentity4x16_col_neon,  // IDTX
+  fdct4x16_col_neon,       // V_DCT
+  fidentity4x16_col_neon,  // H_DCT
+  fadst4x16_col_neon,      // V_ADST
+  fidentity4x16_col_neon,  // H_ADST
+  fadst4x16_col_neon,      // V_FLIPADST
+  fidentity4x16_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct8x32_neon,   // DCT_DCT
-  NULL,                // ADST_DCT
-  NULL,                // DCT_ADST
-  NULL,                // ADST_ADST
-  NULL,                // FLIPADST_DCT
-  NULL,                // DCT_FLIPADST
-  NULL,                // FLIPADST_FLIPADST
-  NULL,                // ADST_FLIPADST
-  NULL,                // FLIPADST_ADST
-  fidentity8x32_neon,  // IDTX
-  av1_fdct8x32_neon,   // V_DCT
-  fidentity8x32_neon,  // H_DCT
-  NULL,                // V_ADST
-  NULL,                // H_ADST
-  NULL,                // V_FLIPADST
-  NULL                 // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+  fdct4x16_row_neon,       // DCT_DCT
+  fdct4x16_row_neon,       // ADST_DCT
+  fadst4x16_row_neon,      // DCT_ADST
+  fadst4x16_row_neon,      // ADST_ADST
+  fdct4x16_row_neon,       // FLIPADST_DCT
+  fadst4x16_row_neon,      // DCT_FLIPADST
+  fadst4x16_row_neon,      // FLIPADST_FLIPADST
+  fadst4x16_row_neon,      // ADST_FLIPADST
+  fadst4x16_row_neon,      // FLIPADST_ADST
+  fidentity4x16_row_neon,  // IDTX
+  fidentity4x16_row_neon,  // V_DCT
+  fdct4x16_row_neon,       // H_DCT
+  fidentity4x16_row_neon,  // V_ADST
+  fadst4x16_row_neon,      // H_ADST
+  fidentity4x16_row_neon,  // V_FLIPADST
+  fadst4x16_row_neon       // H_FLIPADST
 };
 
-void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_col_neon,       // DCT_DCT
+  fadst8x16_col_neon,      // ADST_DCT
+  fdct8x16_col_neon,       // DCT_ADST
+  fadst8x16_col_neon,      // ADST_ADST
+  fadst8x16_col_neon,      // FLIPADST_DCT
+  fdct8x16_col_neon,       // DCT_FLIPADST
+  fadst8x16_col_neon,      // FLIPADST_FLIPADST
+  fadst8x16_col_neon,      // ADST_FLIPADST
+  fadst8x16_col_neon,      // FLIPADST_ADST
+  fidentity8x16_col_neon,  // IDTX
+  fdct8x16_col_neon,       // V_DCT
+  fidentity8x16_col_neon,  // H_DCT
+  fadst8x16_col_neon,      // V_ADST
+  fidentity8x16_col_neon,  // H_ADST
+  fadst8x16_col_neon,      // V_FLIPADST
+  fidentity8x16_col_neon   // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_row_neon,       // DCT_DCT
+  fdct8x16_row_neon,       // ADST_DCT
+  fadst8x16_row_neon,      // DCT_ADST
+  fadst8x16_row_neon,      // ADST_ADST
+  fdct8x16_row_neon,       // FLIPADST_DCT
+  fadst8x16_row_neon,      // DCT_FLIPADST
+  fadst8x16_row_neon,      // FLIPADST_FLIPADST
+  fadst8x16_row_neon,      // ADST_FLIPADST
+  fadst8x16_row_neon,      // FLIPADST_ADST
+  fidentity8x16_row_neon,  // IDTX
+  fidentity8x16_row_neon,  // V_DCT
+  fdct8x16_row_neon,       // H_DCT
+  fidentity8x16_row_neon,  // V_ADST
+  fadst8x16_row_neon,      // H_ADST
+  fidentity8x16_row_neon,  // V_FLIPADST
+  fadst8x16_row_neon       // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_row_rect_neon,       // DCT_DCT
+  fdct8x16_row_rect_neon,       // ADST_DCT
+  fadst8x16_row_rect_neon,      // DCT_ADST
+  fadst8x16_row_rect_neon,      // ADST_ADST
+  fdct8x16_row_rect_neon,       // FLIPADST_DCT
+  fadst8x16_row_rect_neon,      // DCT_FLIPADST
+  fadst8x16_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x16_row_rect_neon,      // ADST_FLIPADST
+  fadst8x16_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x16_row_rect_neon,  // IDTX
+  fidentity8x16_row_rect_neon,  // V_DCT
+  fdct8x16_row_rect_neon,       // H_DCT
+  fidentity8x16_row_rect_neon,  // V_ADST
+  fadst8x16_row_rect_neon,      // H_ADST
+  fidentity8x16_row_rect_neon,  // V_FLIPADST
+  fadst8x16_row_rect_neon       // H_FLIPADST
+};
+
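+// Only DCT and identity are implemented for the 32-point dimension; callers
+// such as lowbd_fwd_txfm2d_16x32_neon fall back to the C implementation when
+// they hit a NULL entry.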
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_row_neon,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_row_neon,  // IDTX
+  fidentity8x32_row_neon,  // V_DCT
+  fdct8x32_row_neon,       // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_row_rect_neon,       // DCT_DCT
+  NULL,                         // ADST_DCT
+  NULL,                         // DCT_ADST
+  NULL,                         // ADST_ADST
+  NULL,                         // FLIPADST_DCT
+  NULL,                         // DCT_FLIPADST
+  NULL,                         // FLIPADST_FLIPADST
+  NULL,                         // ADST_FLIPADST
+  NULL,                         // FLIPADST_ADST
+  fidentity8x32_row_rect_neon,  // IDTX
+  fidentity8x32_row_rect_neon,  // V_DCT
+  fdct8x32_row_rect_neon,       // H_DCT
+  NULL,                         // V_ADST
+  NULL,                         // H_ADST
+  NULL,                         // V_FLIPADST
+  NULL                          // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_col_neon,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_col_neon,  // IDTX
+  fdct8x32_col_neon,       // V_DCT
+  fidentity8x32_col_neon,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
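+// 2-D low-bitdepth forward transform kernels: the column pass (which includes
+// the stage-0 shift) runs first, the intermediate result is transposed and,
+// for flipped transform types, mirrored, then the row pass writes the 32-bit
+// coefficients. Vertical flipping is handled up front by
+// ud_adjust_input_and_stride.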
+static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[4], buf1[4], *buf;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
-  const int txw_idx = get_txw_idx(TX_4X4);
-  const int txh_idx = get_txh_idx(TX_4X4);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 4;
-  const int height = 4;
-  const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
+  int16x4_t buf0[4], buf1[4];
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
-  }
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_4x4(buf0, buf1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+  col_txfm(input, buf0, stride, 13);
+  transpose_arrays_s16_4x4(buf0, buf1);
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    flip_buf_4_neon(buf1, buf0, 4);
+    row_txfm(buf0, output, 4, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 4, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-
-  store_buffer_16bit_to_32bit_w4(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
-  (void)stride;
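+// Sizes with a 1:2 aspect ratio use the *_row_rect_* dispatch tables so that
+// the row pass applies the additional rectangular scaling on store.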
+static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[8], buf1[8], *buf;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
-  const int txw_idx = get_txw_idx(TX_4X8);
-  const int txh_idx = get_txh_idx(TX_4X8);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 4;
-  const int height = 8;
-  const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
-  int ud_flip, lr_flip;
+  int16x4_t buf0[8];
+  int16x8_t buf1[8];
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
 
+  int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
-  }
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_4x8(buf0, buf1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+  col_txfm(input, buf0, stride, 13);
+  shift_right_1_round_s16_x4(buf0, buf0, 8);
+  transpose_arrays_s16_4x8(buf0, buf1);
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    int16x8_t buf2[8];
+    flip_buf_8_neon(buf1, buf2, 4);
+    row_txfm(buf2, output, 8, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 8, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-  store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[16], buf1[16];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
-  const int txw_idx = get_txw_idx(TX_4X16);
-  const int txh_idx = get_txh_idx(TX_4X16);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 4;
-  const int height = 16;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int16x4_t buf0[16];
+  int16x8_t buf1[16];
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
-  }
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_4x8(buf0, buf1);
-  transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+  col_txfm(input, buf0, stride, 13);
+  shift_right_1_round_s16_x4(buf0, buf0, 16);
+  transpose_arrays_s16_4x8(buf0, buf1);
+  transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
 
   for (int i = 0; i < 2; i++) {
-    int16x8_t *buf;
     if (lr_flip) {
-      buf = buf0;
-      flip_buf_neon(buf1 + 8 * i, buf, width);
+      int16x8_t buf2[16];
+      flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
+      row_txfm(buf2, output + 8 * i, 16, 12);
     } else {
-      buf = buf1 + 8 * i;
+      int16x8_t *buf = buf1 + 8 * i;
+      row_txfm(buf, output + 8 * i, 16, 12);
     }
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift2);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
   }
 }
 
-void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[8], buf1[8], *buf;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
-  const int txw_idx = get_txw_idx(TX_8X4);
-  const int txh_idx = get_txh_idx(TX_8X4);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 8;
-  const int height = 4;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
+  int16x8_t buf0[8];
+  int16x4_t buf1[8];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip)
-    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-  else
-    load_buffer_16bit_to_16bit(input, stride, buf0, height);
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_8x8(buf0, buf1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+  col_txfm(input, buf0, stride, 13);
+  shift_right_1_round_s16_x8(buf0, buf0, 4);
+  transpose_arrays_s16_8x4(buf0, buf1);
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    int16x4_t buf2[8];
+    flip_buf_4_neon(buf1, buf2, 8);
+    row_txfm(buf2, output, 4, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 4, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-  store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[8], buf1[8], *buf;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
-  const int txw_idx = get_txw_idx(TX_8X8);
-  const int txh_idx = get_txh_idx(TX_8X8);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 8;
-  const int height = 8;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int16x8_t buf0[8], buf1[8];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip)
-    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-  else
-    load_buffer_16bit_to_16bit(input, stride, buf0, height);
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_8x8(buf0, buf1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+  col_txfm(input, buf0, stride, 13);
+  shift_right_1_round_s16_x8(buf0, buf0, 8);
+  transpose_arrays_s16_8x8(buf0, buf1);
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    flip_buf_8_neon(buf1, buf0, 8);
+    row_txfm(buf0, output, 8, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 8, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-  store_buffer_16bit_to_32bit_w8(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[16];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
-  const int txw_idx = get_txw_idx(TX_8X16);
-  const int txh_idx = get_txh_idx(TX_8X16);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 8;
-  const int height = 16;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit(input, stride, buf0, height);
-  }
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_8x8(buf0, buf1);
-  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+  col_txfm(input, buf0, stride, 13);
+  shift_right_2_round_s16_x8(buf0, buf0, 16);
+  transpose_arrays_s16_8x8(buf0, buf1);
+  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
 
   for (int i = 0; i < 2; i++) {
-    int16x8_t *buf;
     if (lr_flip) {
-      buf = buf0;
-      flip_buf_neon(buf1 + width * i, buf, width);
+      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+      row_txfm(buf0, output + 8 * i, 16, 13);
     } else {
-      buf = buf1 + width * i;
+      int16x8_t *buf = buf1 + 8 * i;
+      row_txfm(buf, output + 8 * i, 16, 13);
     }
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift2);
-    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, 8);
   }
 }
 
-void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[32];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
-  const int txw_idx = get_txw_idx(TX_8X32);
-  const int txh_idx = get_txh_idx(TX_8X32);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 8;
-  const int height = 32;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit(input, stride, buf0, height);
-  }
-  round_shift_16bit_vector(buf0, height, &v_shift0);
-  col_txfm(buf0, buf0, cos_bit_col, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift1);
-  transpose_16bit_8x8(buf0, buf1);
-  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
-  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
-  transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+  col_txfm(input, buf0, stride, 12);
+  shift_right_2_round_s16_x8(buf0, buf0, 32);
+  transpose_arrays_s16_8x8(buf0, buf1);
+  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+  transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
+  transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
 
   for (int i = 0; i < 4; i++) {
-    int16x8_t *buf;
     if (lr_flip) {
-      buf = buf0;
-      flip_buf_neon(buf1 + width * i, buf, width);
+      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+      row_txfm(buf0, output + 8 * i, 32, 12);
     } else {
-      buf = buf1 + width * i;
+      int16x8_t *buf = buf1 + 8 * i;
+      row_txfm(buf, output + 8 * i, 32, 12);
     }
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift2);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
   }
 }
 
-void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
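+// Blocks wider than 8 columns are processed in 8-column slices: each slice is
+// column-transformed and transposed into buf1 before the row pass runs over
+// the transposed rows.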
+static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  int16x8_t buf0[16], buf1[16];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
-  const int txw_idx = get_txw_idx(TX_16X4);
-  const int txh_idx = get_txh_idx(TX_16X4);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 16;
-  const int height = 4;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
-  int16x8_t *buf;
+  int16x8_t buf0[16];
+  int16x4_t buf1[16];
+  int16x4_t buf2[16];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
   for (int i = 0; i < 2; i++) {
-    if (ud_flip) {
-      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-    } else {
-      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    }
-    round_shift_16bit_vector(buf0, height, &v_shift0);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift1);
-    transpose_16bit_8x4(buf0, buf1 + 8 * i);
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_1_round_s16_x8(buf0, buf0, 4);
+    transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
   }
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    flip_buf_4_neon(buf1, buf2, 16);
+    row_txfm(buf2, output, 4, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 4, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-  store_buffer_16bit_to_32bit_w4(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[16];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
-  const int txw_idx = get_txw_idx(TX_16X8);
-  const int txh_idx = get_txh_idx(TX_16X8);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 16;
-  const int height = 8;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
-  int16x8_t *buf;
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   for (int i = 0; i < 2; i++) {
-    if (ud_flip) {
-      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-    } else {
-      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    }
-    round_shift_16bit_vector(buf0, height, &v_shift0);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift1);
-    transpose_16bit_8x8(buf0, buf1 + 8 * i);
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 8);
+    transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
   }
 
   if (lr_flip) {
-    buf = buf0;
-    flip_buf_neon(buf1, buf, width);
+    flip_buf_8_neon(buf1, buf0, 16);
+    row_txfm(buf0, output, 8, 13);
   } else {
-    buf = buf1;
+    row_txfm(buf1, output, 8, 13);
   }
-  row_txfm(buf, buf, cos_bit_row, NULL);
-  round_shift_16bit_vector(buf0, height, &v_shift2);
-  store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
 }
 
-void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[32];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
-  const int txw_idx = get_txw_idx(TX_16X16);
-  const int txh_idx = get_txh_idx(TX_16X16);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 16;
-  const int height = 16;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-  const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-  const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   for (int i = 0; i < 2; i++) {
-    if (ud_flip) {
-      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-    } else {
-      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    }
-    round_shift_16bit_vector(buf0, height, &v_shift0);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift1);
-    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
-    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 16);
+    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
   }
 
   for (int i = 0; i < 2; i++) {
-    int16x8_t *buf;
     if (lr_flip) {
-      buf = buf0;
-      flip_buf_neon(buf1 + width * i, buf, width);
+      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+      row_txfm(buf0, output + 8 * i, 16, 12);
     } else {
-      buf = buf1 + width * i;
+      int16x8_t *buf = buf1 + 16 * i;
+      row_txfm(buf, output + 8 * i, 16, 12);
     }
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit_vector(buf0, height, &v_shift2);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
   }
 }
 
-void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[64];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
-  const int txw_idx = get_txw_idx(TX_16X32);
-  const int txh_idx = get_txh_idx(TX_16X32);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 16;
-  const int height = 32;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
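+  // 16x32 is a 2:1 rectangular transform, so the row pass uses the _rect_
+  // kernel variants, which fold in the extra NewSqrt2 rescaling required for
+  // rectangular block sizes.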
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-    const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-    const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-    const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
-    for (int i = 0; i < 2; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit_vector(buf0, height, &v_shift0);
-      col_txfm(buf0, buf0, cos_bit_col, NULL);
-      round_shift_16bit_vector(buf0, height, &v_shift1);
-      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
-    }
-
-    for (int i = 0; i < 4; i++) {
-      int16x8_t *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_neon(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row, NULL);
-      round_shift_16bit_vector(buf0, height, &v_shift2);
-      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
-    }
-  } else {
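+  // The kernel tables hold NULL for tx_types without a Neon implementation;
+  // fall back to the C transform for those.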
+  if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+  for (int i = 0; i < 2; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 12);
+    shift_right_4_round_s16_x8(buf0, buf0, 32);
+    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+      row_txfm(buf0, output + 8 * i, 32, 13);
+    } else {
+      int16x8_t *buf = buf1 + 16 * i;
+      row_txfm(buf, output + 8 * i, 32, 13);
+    }
   }
 }
 
-void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
-                                    int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[32];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
-  const int txw_idx = get_txw_idx(TX_32X8);
-  const int txh_idx = get_txh_idx(TX_32X8);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 32;
-  const int height = 8;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-    const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-    const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-    const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-
-    for (int i = 0; i < 4; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit_vector(buf0, height, &v_shift0);
-      col_txfm(buf0, buf0, cos_bit_col, NULL);
-      round_shift_16bit_vector(buf0, height, &v_shift1);
-      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
-    }
-
-    for (int i = 0; i < 1; i++) {
-      int16x8_t *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_neon(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row, NULL);
-      round_shift_16bit_vector(buf, width, &v_shift2);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
-    }
-  } else {
+  if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 8);
+    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+  }
+
+  if (lr_flip) {
+    flip_buf_8_neon(buf1, buf0, 32);
+    row_txfm(buf0, output, 8, 12);
+  } else {
+    row_txfm(buf1, output, 8, 12);
   }
 }
 
-void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[64];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
-  const int txw_idx = get_txw_idx(TX_32X16);
-  const int txh_idx = get_txh_idx(TX_32X16);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 32;
-  const int height = 16;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    const int16x8_t v_shift0 = vdupq_n_s16(shift[0]);
-    const int16x8_t v_shift1 = vdupq_n_s16(shift[1]);
-    const int16x8_t v_shift2 = vdupq_n_s16(shift[2]);
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-    for (int i = 0; i < 4; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit_vector(buf0, height, &v_shift0);
-      col_txfm(buf0, buf0, cos_bit_col, NULL);
-      round_shift_16bit_vector(buf0, height, &v_shift1);
-      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
-    }
-
-    for (int i = 0; i < 2; i++) {
-      int16x8_t *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_neon(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row, NULL);
-      round_shift_16bit_vector(buf, width, &v_shift2);
-      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
-    }
-  } else {
+  if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_4_round_s16_x8(buf0, buf0, 16);
+    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+      row_txfm(buf0, output + 8 * i, 16, 13);
+    } else {
+      int16x8_t *buf = buf1 + 32 * i;
+      row_txfm(buf, output + 8 * i, 16, 13);
+    }
   }
 }
 
-void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[128];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
-  const int txw_idx = get_txw_idx(TX_32X32);
-  const int txh_idx = get_txh_idx(TX_32X32);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 32;
-  const int height = 32;
-  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-    for (int i = 0; i < 4; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit(buf0, height, shift[0]);
-      col_txfm(buf0, buf0, cos_bit_col, NULL);
-      round_shift_16bit(buf0, height, shift[1]);
-      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
-    }
-
-    for (int i = 0; i < 4; i++) {
-      int16x8_t *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_neon(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row, NULL);
-      round_shift_16bit(buf, width, shift[2]);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
-    }
-  } else {
+  if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 12);
+    shift_right_4_round_s16_x8(buf0, buf0, 32);
+    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+      row_txfm(buf0, output + 8 * i, 32, 12);
+    } else {
+      int16x8_t *buf = buf1 + 32 * i;
+      row_txfm(buf, output + 8 * i, 32, 12);
+    }
   }
 }
 
-void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   (void)tx_type;
   assert(tx_type == DCT_DCT);
-  const TX_SIZE tx_size = TX_64X16;
   int16x8_t buf0[64], buf1[128];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
-  const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
-  const int width_div8 = (width >> 3);
-  const int height_div8 = (height >> 3);
+  const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
+  const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
 
-  for (int i = 0; i < width_div8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit(buf0, height, shift[1]);
-    for (int j = 0; j < height_div8; ++j) {
-      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+  for (int i = 0; i < 8; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
+    shift_left_2_s16_x8(buf0, buf0, 16);
+    col_txfm(buf0, buf0, 13);
+    shift_right_4_round_s16_x8(buf0, buf0, 16);
+    for (int j = 0; j < 2; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
     }
   }
 
-  for (int i = 0; i < height_div8; i++) {
-    int16x8_t *buf = buf1 + width * i;
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit(buf, width, shift[2]);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
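+  // Only the first 32 of the 64 row-transform outputs are kept, i.e. a 32x16
+  // block of coefficients; the remainder of the output is cleared below.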
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf = buf1 + 64 * i;
+    row_txfm(buf, buf, 12);
+    store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
   }
   // Zero out the bottom 16x32 area.
   memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
 }
 
-void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   (void)tx_type;
   assert(tx_type == DCT_DCT);
-  const TX_SIZE tx_size = TX_16X64;
   int16x8_t buf0[64], buf1[128];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
-  const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
-  const int width_div8 = (width >> 3);
-  const int height_div8 = (height >> 3);
+  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+  const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
 
-  for (int i = 0; i < width_div8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit(buf0, height, shift[1]);
-    for (int j = 0; j < height_div8; ++j) {
-      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+  for (int i = 0; i < 2; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+    col_txfm(buf0, buf0, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 64);
+    for (int j = 0; j < 8; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
     }
   }
 
-  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
-    int16x8_t *buf = buf1 + width * i;
-    row_txfm(buf, buf, cos_bit_row, NULL);
-    round_shift_16bit(buf, width, shift[2]);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
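+  // Only the first 32 of the 64 column-transform outputs (rows 0-31) are
+  // row-transformed and stored; the high-frequency half is dropped.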
+  for (int i = 0; i < 4; i++) {
+    int16x8_t *buf = buf1 + 16 * i;
+    row_txfm(buf, buf, 12);
+    store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
   }
 }
 
-#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3)      \
-  do {                                                         \
-    int32x4x2_t temp01 = vzipq_s32(x0, x1);                    \
-    int32x4x2_t temp23 = vzipq_s32(x2, x3);                    \
-    int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]); \
-    int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]); \
-    y0 = y01.val[0];                                           \
-    y1 = y01.val[1];                                           \
-    y2 = y23.val[0];                                           \
-    y3 = y23.val[1];                                           \
-  } while (0)
+static void fdct32_new_neon(const int32x4_t *input, int32x4_t *output,
+                            int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
 
-static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
-                                int cos_bit, const int stride,
-                                const int8_t *stage_range) {
-  (void)stage_range;
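+  // Each 128-bit load below fetches the Q13 cosine constants for two
+  // butterfly angles; vget_low_s16/vget_high_s16 then split them into the
+  // per-angle vectors consumed by the butterfly helpers.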
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
   int32x4_t buf0[32];
   int32x4_t buf1[32];
-  const int32_t *cospi;
-  cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
 
-  int startidx = 0 * stride;
-  int endidx = 31 * stride;
-  // stage 0
   // stage 1
-  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
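+  // butterfly_dct_pre_s32_x4(in, out, n) computes the mirrored sums and
+  // differences out[i] = in[i] + in[n-1-i], out[n-1-i] = in[i] - in[n-1-i];
+  // together with butterfly_dct_post it replaces the long runs of explicit
+  // vaddq_s32/vsubq_s32 pairs.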
+  butterfly_dct_pre_s32_x4(input, buf1, 32);
 
   // stage 2
-  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
-  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
-  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
-  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
-  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
-  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
-  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
-  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
-  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
-  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
-  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
-  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
-  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
-  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
-  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
-  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  butterfly_dct_pre_s32_x4(buf1, buf0, 16);
   buf0[16] = buf1[16];
   buf0[17] = buf1[17];
   buf0[18] = buf1[18];
   buf0[19] = buf1[19];
-  btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
-                    buf0[27], v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
-                    buf0[25], v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
-                    buf0[24], v_cos_bit);
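+  // The digit suffixes on the butterfly helpers (_0112, _1223, _1003, _0332)
+  // encode which signed combination of the cosine weights each of the two
+  // outputs uses.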
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
+                                 &buf0[20]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
+                                 &buf0[21]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
+                                 &buf0[22]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
+                                 &buf0[23]);
   buf0[28] = buf1[28];
   buf0[29] = buf1[29];
   buf0[30] = buf1[30];
   buf0[31] = buf1[31];
 
   // stage 3
-  cospi = cospi_arr(cos_bit);
-  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
-  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
-  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
-  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
-  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
-  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
-  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
-  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  butterfly_dct_pre_s32_x4(buf0, buf1, 8);
   buf1[8] = buf0[8];
   buf1[9] = buf0[9];
-  btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
-                    buf1[13], v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
-                    buf1[12], v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
+                                 &buf1[10]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
+                                 &buf1[11]);
   buf1[14] = buf0[14];
   buf1[15] = buf0[15];
-  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
-  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
-  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
-  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
-  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
-  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
-  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
-  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
-  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
-  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
-  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
-  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
-  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
-  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
-  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
 
   // stage 4
-  cospi = cospi_arr(cos_bit);
-  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
-  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
-  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
-  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  butterfly_dct_pre_s32_x4(buf1, buf0, 4);
   buf0[4] = buf1[4];
-  btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
-                    v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
   buf0[7] = buf1[7];
-  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
-  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
-  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
-  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
-  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
-  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
-  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
-  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
   buf0[16] = buf1[16];
   buf0[17] = buf1[17];
-  btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
-                    buf0[29], v_cos_bit);
-  btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
-                    buf0[28], v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
-                     buf0[27], v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
-                     buf0[26], v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
+                                 &buf0[18]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
+                                 &buf0[19]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
+                                 &buf0[20]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
+                                 &buf0[21]);
   buf0[22] = buf1[22];
   buf0[23] = buf1[23];
   buf0[24] = buf1[24];
@@ -3202,69 +2467,40 @@
   buf0[31] = buf1[31];
 
   // stage 5
-  cospi = cospi_arr(cos_bit);
-  btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
-              v_cos_bit);
-  btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
-                    v_cos_bit);
-  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
-  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
-  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
-  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
+  butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
   buf1[8] = buf0[8];
-  btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
-                     buf1[13], v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
+                                 &buf1[9]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
+                                 &buf1[10]);
   buf1[11] = buf0[11];
   buf1[12] = buf0[12];
   buf1[15] = buf0[15];
-  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
-  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
-  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
-  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
-  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
-  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
-  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
-  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
-  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
-  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
-  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
-  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
-  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
-  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
-  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
 
   // stage 6
-  cospi = cospi_arr(cos_bit);
   buf0[0] = buf1[0];
   buf0[1] = buf1[1];
   buf0[2] = buf1[2];
   buf0[3] = buf1[3];
-  btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
-                    v_cos_bit);
-  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
-  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
-  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
-  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
-  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
-  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
-  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
-  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
+  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
+  butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+  butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
   buf0[16] = buf1[16];
-  btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
-                     buf0[29], v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
+                                 &buf0[17]);
+  butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
+                                 &buf0[18]);
   buf0[19] = buf1[19];
   buf0[20] = buf1[20];
-  btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
-                     buf0[25], v_cos_bit);
+  butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
+                                 &buf0[21]);
+  butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
+                                 &buf0[22]);
   buf0[23] = buf1[23];
   buf0[24] = buf1[24];
   buf0[27] = buf1[27];
@@ -3272,7 +2508,6 @@
   buf0[31] = buf1[31];
 
   // stage 7
-  cospi = cospi_arr(cos_bit);
   buf1[0] = buf0[0];
   buf1[1] = buf0[1];
   buf1[2] = buf0[2];
@@ -3281,34 +2516,20 @@
   buf1[5] = buf0[5];
   buf1[6] = buf0[6];
   buf1[7] = buf0[7];
-
-  btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
-                    buf1[13], v_cos_bit);
-  btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
-                    buf1[12], v_cos_bit);
-  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
-  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
-  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
-  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
-  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
-  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
-  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
-  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
-  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
-  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
-  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
-  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
-  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
-  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
-  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+  butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
+                                 &buf1[15]);
+  butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
+                                 &buf1[14]);
+  butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
+                                 &buf1[13]);
+  butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
+                                 &buf1[12]);
+  butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+  butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+  butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+  butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
 
   // stage 8
-  cospi = cospi_arr(cos_bit);
   buf0[0] = buf1[0];
   buf0[1] = buf1[1];
   buf0[2] = buf1[2];
@@ -3325,988 +2546,408 @@
   buf0[13] = buf1[13];
   buf0[14] = buf1[14];
   buf0[15] = buf1[15];
+  butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
+                                 &buf0[31]);
+  butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
+                                 &buf0[30]);
+  butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
+                                 &buf0[29]);
+  butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
+                                 &buf0[28]);
+  butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
+                                 &buf0[27]);
+  butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
+                                 &buf0[26]);
+  butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
+                                 &buf0[25]);
+  butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
+                                 &buf0[24]);
 
-  btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
-                    buf0[30], v_cos_bit);
-  btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
-                    buf0[29], v_cos_bit);
-  btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
-                    buf0[28], v_cos_bit);
-  btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
-                    buf0[27], v_cos_bit);
-  btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
-                    buf0[25], v_cos_bit);
-  btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
-                    v_cos_bit);
-
-  startidx = 0 * stride;
-  endidx = 31 * stride;
   // stage 9
-  output[startidx] = buf0[0];
-  output[endidx] = buf0[31];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[16];
-  output[endidx] = buf0[15];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[8];
-  output[endidx] = buf0[23];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[24];
-  output[endidx] = buf0[7];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[4];
-  output[endidx] = buf0[27];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[20];
-  output[endidx] = buf0[11];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[12];
-  output[endidx] = buf0[19];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[28];
-  output[endidx] = buf0[3];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[2];
-  output[endidx] = buf0[29];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[18];
-  output[endidx] = buf0[13];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[10];
-  output[endidx] = buf0[21];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[26];
-  output[endidx] = buf0[5];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[6];
-  output[endidx] = buf0[25];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[22];
-  output[endidx] = buf0[9];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[14];
-  output[endidx] = buf0[17];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[30];
-  output[endidx] = buf0[1];
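+  // The butterfly network leaves buf0 in bit-reversed index order; stage 9
+  // writes the coefficients back out in natural order.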
+  output[0] = buf0[0];
+  output[1] = buf0[16];
+  output[2] = buf0[8];
+  output[3] = buf0[24];
+  output[4] = buf0[4];
+  output[5] = buf0[20];
+  output[6] = buf0[12];
+  output[7] = buf0[28];
+  output[8] = buf0[2];
+  output[9] = buf0[18];
+  output[10] = buf0[10];
+  output[11] = buf0[26];
+  output[12] = buf0[6];
+  output[13] = buf0[22];
+  output[14] = buf0[14];
+  output[15] = buf0[30];
+  output[16] = buf0[1];
+  output[17] = buf0[17];
+  output[18] = buf0[9];
+  output[19] = buf0[25];
+  output[20] = buf0[5];
+  output[21] = buf0[21];
+  output[22] = buf0[13];
+  output[23] = buf0[29];
+  output[24] = buf0[3];
+  output[25] = buf0[19];
+  output[26] = buf0[11];
+  output[27] = buf0[27];
+  output[28] = buf0[7];
+  output[29] = buf0[23];
+  output[30] = buf0[15];
+  output[31] = buf0[31];
 }
 
-static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
-                                          int32x4_t *x3, int32x4_t *x4,
-                                          const int32_t *cospi,
-                                          const int32x4_t *v_cos_bit,
-                                          int *startidx, int *endidx) {
+static void fdct64_new_neon(const int32x4_t *input, int32x4_t *output,
+                            int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+  const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
   // stage 1
   int32x4_t x1[64];
-  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+  butterfly_dct_pre_s32_x4(input, x1, 64);
 
   // stage 2
   int32x4_t x2[64];
-  x2[0] = vaddq_s32(x1[0], x1[31]);
-  x2[31] = vsubq_s32(x1[0], x1[31]);
-  x2[1] = vaddq_s32(x1[1], x1[30]);
-  x2[30] = vsubq_s32(x1[1], x1[30]);
-  x2[2] = vaddq_s32(x1[2], x1[29]);
-  x2[29] = vsubq_s32(x1[2], x1[29]);
-  x2[3] = vaddq_s32(x1[3], x1[28]);
-  x2[28] = vsubq_s32(x1[3], x1[28]);
-  x2[4] = vaddq_s32(x1[4], x1[27]);
-  x2[27] = vsubq_s32(x1[4], x1[27]);
-  x2[5] = vaddq_s32(x1[5], x1[26]);
-  x2[26] = vsubq_s32(x1[5], x1[26]);
-  x2[6] = vaddq_s32(x1[6], x1[25]);
-  x2[25] = vsubq_s32(x1[6], x1[25]);
-  x2[7] = vaddq_s32(x1[7], x1[24]);
-  x2[24] = vsubq_s32(x1[7], x1[24]);
-  x2[8] = vaddq_s32(x1[8], x1[23]);
-  x2[23] = vsubq_s32(x1[8], x1[23]);
-  x2[9] = vaddq_s32(x1[9], x1[22]);
-  x2[22] = vsubq_s32(x1[9], x1[22]);
-  x2[10] = vaddq_s32(x1[10], x1[21]);
-  x2[21] = vsubq_s32(x1[10], x1[21]);
-  x2[11] = vaddq_s32(x1[11], x1[20]);
-  x2[20] = vsubq_s32(x1[11], x1[20]);
-  x2[12] = vaddq_s32(x1[12], x1[19]);
-  x2[19] = vsubq_s32(x1[12], x1[19]);
-  x2[13] = vaddq_s32(x1[13], x1[18]);
-  x2[18] = vsubq_s32(x1[13], x1[18]);
-  x2[14] = vaddq_s32(x1[14], x1[17]);
-  x2[17] = vsubq_s32(x1[14], x1[17]);
-  x2[15] = vaddq_s32(x1[15], x1[16]);
-  x2[16] = vsubq_s32(x1[15], x1[16]);
-
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
-                    *v_cos_bit);
+  butterfly_dct_pre_s32_x4(x1, x2, 32);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
 
   // stage 3
-  x3[0] = vaddq_s32(x2[0], x2[15]);
-  x3[15] = vsubq_s32(x2[0], x2[15]);
-  x3[1] = vaddq_s32(x2[1], x2[14]);
-  x3[14] = vsubq_s32(x2[1], x2[14]);
-  x3[2] = vaddq_s32(x2[2], x2[13]);
-  x3[13] = vsubq_s32(x2[2], x2[13]);
-  x3[3] = vaddq_s32(x2[3], x2[12]);
-  x3[12] = vsubq_s32(x2[3], x2[12]);
-  x3[4] = vaddq_s32(x2[4], x2[11]);
-  x3[11] = vsubq_s32(x2[4], x2[11]);
-  x3[5] = vaddq_s32(x2[5], x2[10]);
-  x3[10] = vsubq_s32(x2[5], x2[10]);
-  x3[6] = vaddq_s32(x2[6], x2[9]);
-  x3[9] = vsubq_s32(x2[6], x2[9]);
-  x3[7] = vaddq_s32(x2[7], x2[8]);
-  x3[8] = vsubq_s32(x2[7], x2[8]);
-
-  btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
-                    *v_cos_bit);
-
-  x3[32] = vaddq_s32(x1[32], x2[47]);
-  x3[47] = vsubq_s32(x1[32], x2[47]);
-  x3[33] = vaddq_s32(x1[33], x2[46]);
-  x3[46] = vsubq_s32(x1[33], x2[46]);
-  x3[34] = vaddq_s32(x1[34], x2[45]);
-  x3[45] = vsubq_s32(x1[34], x2[45]);
-  x3[35] = vaddq_s32(x1[35], x2[44]);
-  x3[44] = vsubq_s32(x1[35], x2[44]);
-  x3[36] = vaddq_s32(x1[36], x2[43]);
-  x3[43] = vsubq_s32(x1[36], x2[43]);
-  x3[37] = vaddq_s32(x1[37], x2[42]);
-  x3[42] = vsubq_s32(x1[37], x2[42]);
-  x3[38] = vaddq_s32(x1[38], x2[41]);
-  x3[41] = vsubq_s32(x1[38], x2[41]);
-  x3[39] = vaddq_s32(x1[39], x2[40]);
-  x3[40] = vsubq_s32(x1[39], x2[40]);
-  x3[48] = vsubq_s32(x1[63], x2[48]);
-  x3[63] = vaddq_s32(x1[63], x2[48]);
-  x3[49] = vsubq_s32(x1[62], x2[49]);
-  x3[62] = vaddq_s32(x1[62], x2[49]);
-  x3[50] = vsubq_s32(x1[61], x2[50]);
-  x3[61] = vaddq_s32(x1[61], x2[50]);
-  x3[51] = vsubq_s32(x1[60], x2[51]);
-  x3[60] = vaddq_s32(x1[60], x2[51]);
-  x3[52] = vsubq_s32(x1[59], x2[52]);
-  x3[59] = vaddq_s32(x1[59], x2[52]);
-  x3[53] = vsubq_s32(x1[58], x2[53]);
-  x3[58] = vaddq_s32(x1[58], x2[53]);
-  x3[54] = vsubq_s32(x1[57], x2[54]);
-  x3[57] = vaddq_s32(x1[57], x2[54]);
-  x3[55] = vsubq_s32(x1[56], x2[55]);
-  x3[56] = vaddq_s32(x1[56], x2[55]);
+  int32x4_t x3[64];
+  butterfly_dct_pre_s32_x4(x2, x3, 16);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
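+  // Note the two source pointers: x2[32..39] and x2[56..63] were not written
+  // by stage 2, so the outer terms still come from x1 while the inner terms
+  // come from the stage-2 cospi32 butterflies in x2.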
+  butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);
 
   // stage 4
-  x4[0] = vaddq_s32(x3[0], x3[7]);
-  x4[7] = vsubq_s32(x3[0], x3[7]);
-  x4[1] = vaddq_s32(x3[1], x3[6]);
-  x4[6] = vsubq_s32(x3[1], x3[6]);
-  x4[2] = vaddq_s32(x3[2], x3[5]);
-  x4[5] = vsubq_s32(x3[2], x3[5]);
-  x4[3] = vaddq_s32(x3[3], x3[4]);
-  x4[4] = vsubq_s32(x3[3], x3[4]);
-
-  btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
-                    *v_cos_bit);
-
-  x4[16] = vaddq_s32(x2[16], x3[23]);
-  x4[23] = vsubq_s32(x2[16], x3[23]);
-  x4[17] = vaddq_s32(x2[17], x3[22]);
-  x4[22] = vsubq_s32(x2[17], x3[22]);
-  x4[18] = vaddq_s32(x2[18], x3[21]);
-  x4[21] = vsubq_s32(x2[18], x3[21]);
-  x4[19] = vaddq_s32(x2[19], x3[20]);
-  x4[20] = vsubq_s32(x2[19], x3[20]);
-  x4[24] = vsubq_s32(x2[31], x3[24]);
-  x4[31] = vaddq_s32(x2[31], x3[24]);
-  x4[25] = vsubq_s32(x2[30], x3[25]);
-  x4[30] = vaddq_s32(x2[30], x3[25]);
-  x4[26] = vsubq_s32(x2[29], x3[26]);
-  x4[29] = vaddq_s32(x2[29], x3[26]);
-  x4[27] = vsubq_s32(x2[28], x3[27]);
-  x4[28] = vaddq_s32(x2[28], x3[27]);
-
-  btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
-                    *v_cos_bit);
-  btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
-                    *v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
-                     *v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
-                     *v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
-                     *v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
-                     *v_cos_bit);
-}
-
-static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
-                                int8_t cos_bit, const int instride,
-                                const int outstride,
-                                const int8_t *stage_range) {
-  (void)stage_range;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
-  int startidx = 0 * instride;
-  int endidx = 63 * instride;
-
-  // stage 1-2-3-4
-  int32x4_t x3[64], x4[64];
-  av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
-                                &startidx, &endidx);
+  int32x4_t x4[64];
+  butterfly_dct_pre_s32_x4(x3, x4, 8);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+  butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
 
   // stage 5
   int32x4_t x5[64];
-  x5[0] = vaddq_s32(x4[0], x4[3]);
-  x5[3] = vsubq_s32(x4[0], x4[3]);
-  x5[1] = vaddq_s32(x4[1], x4[2]);
-  x5[2] = vsubq_s32(x4[1], x4[2]);
-
-  btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
-                    v_cos_bit);
-
-  x5[8] = vaddq_s32(x3[8], x4[11]);
-  x5[11] = vsubq_s32(x3[8], x4[11]);
-  x5[9] = vaddq_s32(x3[9], x4[10]);
-  x5[10] = vsubq_s32(x3[9], x4[10]);
-  x5[12] = vsubq_s32(x3[15], x4[12]);
-  x5[15] = vaddq_s32(x3[15], x4[12]);
-  x5[13] = vsubq_s32(x3[14], x4[13]);
-  x5[14] = vaddq_s32(x3[14], x4[13]);
-
-  btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
-                    v_cos_bit);
-  btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
-                     v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
-                     v_cos_bit);
-
-  x5[32] = vaddq_s32(x3[32], x4[39]);
-  x5[39] = vsubq_s32(x3[32], x4[39]);
-  x5[33] = vaddq_s32(x3[33], x4[38]);
-  x5[38] = vsubq_s32(x3[33], x4[38]);
-  x5[34] = vaddq_s32(x3[34], x4[37]);
-  x5[37] = vsubq_s32(x3[34], x4[37]);
-  x5[35] = vaddq_s32(x3[35], x4[36]);
-  x5[36] = vsubq_s32(x3[35], x4[36]);
-  x5[40] = vsubq_s32(x3[47], x4[40]);
-  x5[47] = vaddq_s32(x3[47], x4[40]);
-  x5[41] = vsubq_s32(x3[46], x4[41]);
-  x5[46] = vaddq_s32(x3[46], x4[41]);
-  x5[42] = vsubq_s32(x3[45], x4[42]);
-  x5[45] = vaddq_s32(x3[45], x4[42]);
-  x5[43] = vsubq_s32(x3[44], x4[43]);
-  x5[44] = vaddq_s32(x3[44], x4[43]);
-  x5[48] = vaddq_s32(x3[48], x4[55]);
-  x5[55] = vsubq_s32(x3[48], x4[55]);
-  x5[49] = vaddq_s32(x3[49], x4[54]);
-  x5[54] = vsubq_s32(x3[49], x4[54]);
-  x5[50] = vaddq_s32(x3[50], x4[53]);
-  x5[53] = vsubq_s32(x3[50], x4[53]);
-  x5[51] = vaddq_s32(x3[51], x4[52]);
-  x5[52] = vsubq_s32(x3[51], x4[52]);
-  x5[56] = vsubq_s32(x3[63], x4[56]);
-  x5[63] = vaddq_s32(x3[63], x4[56]);
-  x5[57] = vsubq_s32(x3[62], x4[57]);
-  x5[62] = vaddq_s32(x3[62], x4[57]);
-  x5[58] = vsubq_s32(x3[61], x4[58]);
-  x5[61] = vaddq_s32(x3[61], x4[58]);
-  x5[59] = vsubq_s32(x3[60], x4[59]);
-  x5[60] = vaddq_s32(x3[60], x4[59]);
+  butterfly_dct_pre_s32_x4(x4, x5, 4);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+  butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+  butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
+  butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);
 
   // stage 6
   int32x4_t x6[64];
-  btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
-  btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
-                    v_cos_bit);
-  x6[4] = vaddq_s32(x4[4], x5[5]);
-  x6[5] = vsubq_s32(x4[4], x5[5]);
-  x6[6] = vsubq_s32(x4[7], x5[6]);
-  x6[7] = vaddq_s32(x4[7], x5[6]);
-  btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
-                     v_cos_bit);
-
-  x6[16] = vaddq_s32(x4[16], x5[19]);
-  x6[19] = vsubq_s32(x4[16], x5[19]);
-  x6[17] = vaddq_s32(x4[17], x5[18]);
-  x6[18] = vsubq_s32(x4[17], x5[18]);
-  x6[20] = vsubq_s32(x4[23], x5[20]);
-  x6[23] = vaddq_s32(x4[23], x5[20]);
-  x6[21] = vsubq_s32(x4[22], x5[21]);
-  x6[22] = vaddq_s32(x4[22], x5[21]);
-  x6[24] = vaddq_s32(x4[24], x5[27]);
-  x6[27] = vsubq_s32(x4[24], x5[27]);
-  x6[25] = vaddq_s32(x4[25], x5[26]);
-  x6[26] = vsubq_s32(x4[25], x5[26]);
-  x6[28] = vsubq_s32(x4[31], x5[28]);
-  x6[31] = vaddq_s32(x4[31], x5[28]);
-  x6[29] = vsubq_s32(x4[30], x5[29]);
-  x6[30] = vaddq_s32(x4[30], x5[29]);
-
-  btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
-                    v_cos_bit);
-  btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
-                     v_cos_bit);
-  btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
-                     v_cos_bit);
-  btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
-                    v_cos_bit);
-  btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
-                     v_cos_bit);
-  btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
-                     v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+  butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
+  butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+  butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+  butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
+  butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
+  butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+  butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+  butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+  butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+  butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+  butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+  butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+  butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
 
   // stage 7
   int32x4_t x7[64];
-
-  btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
-  btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
-                    v_cos_bit);
-  x7[8] = vaddq_s32(x5[8], x6[9]);
-  x7[9] = vsubq_s32(x5[8], x6[9]);
-  x7[10] = vsubq_s32(x5[11], x6[10]);
-  x7[11] = vaddq_s32(x5[11], x6[10]);
-  x7[12] = vaddq_s32(x5[12], x6[13]);
-  x7[13] = vsubq_s32(x5[12], x6[13]);
-  x7[14] = vsubq_s32(x5[15], x6[14]);
-  x7[15] = vaddq_s32(x5[15], x6[14]);
-
-  btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
-                     v_cos_bit);
-
-  btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
-                     v_cos_bit);
-
-  x7[32] = vaddq_s32(x5[32], x6[35]);
-  x7[35] = vsubq_s32(x5[32], x6[35]);
-  x7[33] = vaddq_s32(x5[33], x6[34]);
-  x7[34] = vsubq_s32(x5[33], x6[34]);
-  x7[36] = vsubq_s32(x5[39], x6[36]);
-  x7[39] = vaddq_s32(x5[39], x6[36]);
-  x7[37] = vsubq_s32(x5[38], x6[37]);
-  x7[38] = vaddq_s32(x5[38], x6[37]);
-  x7[40] = vaddq_s32(x5[40], x6[43]);
-  x7[43] = vsubq_s32(x5[40], x6[43]);
-  x7[41] = vaddq_s32(x5[41], x6[42]);
-  x7[42] = vsubq_s32(x5[41], x6[42]);
-  x7[44] = vsubq_s32(x5[47], x6[44]);
-  x7[47] = vaddq_s32(x5[47], x6[44]);
-  x7[45] = vsubq_s32(x5[46], x6[45]);
-  x7[46] = vaddq_s32(x5[46], x6[45]);
-  x7[48] = vaddq_s32(x5[48], x6[51]);
-  x7[51] = vsubq_s32(x5[48], x6[51]);
-  x7[49] = vaddq_s32(x5[49], x6[50]);
-  x7[50] = vsubq_s32(x5[49], x6[50]);
-  x7[52] = vsubq_s32(x5[55], x6[52]);
-  x7[55] = vaddq_s32(x5[55], x6[52]);
-  x7[53] = vsubq_s32(x5[54], x6[53]);
-  x7[54] = vaddq_s32(x5[54], x6[53]);
-  x7[56] = vaddq_s32(x5[56], x6[59]);
-  x7[59] = vsubq_s32(x5[56], x6[59]);
-  x7[57] = vaddq_s32(x5[57], x6[58]);
-  x7[58] = vsubq_s32(x5[57], x6[58]);
-  x7[60] = vsubq_s32(x5[63], x6[60]);
-  x7[63] = vaddq_s32(x5[63], x6[60]);
-  x7[61] = vsubq_s32(x5[62], x6[61]);
-  x7[62] = vaddq_s32(x5[62], x6[61]);
+  butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+  butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+  butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
+  butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
+  butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+  butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+  butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+  butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+  butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
+  butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
+  butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
+  butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);
 
   // stage 8
   int32x4_t x8[64];
-
-  btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
-                    v_cos_bit);
-  x8[16] = vaddq_s32(x6[16], x7[17]);
-  x8[17] = vsubq_s32(x6[16], x7[17]);
-  x8[18] = vsubq_s32(x6[19], x7[18]);
-  x8[19] = vaddq_s32(x6[19], x7[18]);
-  x8[20] = vaddq_s32(x6[20], x7[21]);
-  x8[21] = vsubq_s32(x6[20], x7[21]);
-  x8[22] = vsubq_s32(x6[23], x7[22]);
-  x8[23] = vaddq_s32(x6[23], x7[22]);
-  x8[24] = vaddq_s32(x6[24], x7[25]);
-  x8[25] = vsubq_s32(x6[24], x7[25]);
-  x8[26] = vsubq_s32(x6[27], x7[26]);
-  x8[27] = vaddq_s32(x6[27], x7[26]);
-  x8[28] = vaddq_s32(x6[28], x7[29]);
-  x8[29] = vsubq_s32(x6[28], x7[29]);
-  x8[30] = vsubq_s32(x6[31], x7[30]);
-  x8[31] = vaddq_s32(x6[31], x7[30]);
-
-  btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
-                     v_cos_bit);
-  btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
-                     v_cos_bit);
-  btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
-                     v_cos_bit);
-  btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
-                    v_cos_bit);
-  btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
-                     v_cos_bit);
+  butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+  butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+  butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+  butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+  butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
+  butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
+  butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
+  butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
+  butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+  butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+  butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+  butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+  butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+  butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+  butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+  butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
 
   // stage 9
   int32x4_t x9[64];
-
-  btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
-                    v_cos_bit);
-  x9[32] = vaddq_s32(x7[32], x8[33]);
-  x9[33] = vsubq_s32(x7[32], x8[33]);
-  x9[34] = vsubq_s32(x7[35], x8[34]);
-  x9[35] = vaddq_s32(x7[35], x8[34]);
-  x9[36] = vaddq_s32(x7[36], x8[37]);
-  x9[37] = vsubq_s32(x7[36], x8[37]);
-  x9[38] = vsubq_s32(x7[39], x8[38]);
-  x9[39] = vaddq_s32(x7[39], x8[38]);
-  x9[40] = vaddq_s32(x7[40], x8[41]);
-  x9[41] = vsubq_s32(x7[40], x8[41]);
-  x9[42] = vsubq_s32(x7[43], x8[42]);
-  x9[43] = vaddq_s32(x7[43], x8[42]);
-  x9[44] = vaddq_s32(x7[44], x8[45]);
-  x9[45] = vsubq_s32(x7[44], x8[45]);
-  x9[46] = vsubq_s32(x7[47], x8[46]);
-  x9[47] = vaddq_s32(x7[47], x8[46]);
-  x9[48] = vaddq_s32(x7[48], x8[49]);
-  x9[49] = vsubq_s32(x7[48], x8[49]);
-  x9[50] = vsubq_s32(x7[51], x8[50]);
-  x9[51] = vaddq_s32(x7[51], x8[50]);
-  x9[52] = vaddq_s32(x7[52], x8[53]);
-  x9[53] = vsubq_s32(x7[52], x8[53]);
-  x9[54] = vsubq_s32(x7[55], x8[54]);
-  x9[55] = vaddq_s32(x7[55], x8[54]);
-  x9[56] = vaddq_s32(x7[56], x8[57]);
-  x9[57] = vsubq_s32(x7[56], x8[57]);
-  x9[58] = vsubq_s32(x7[59], x8[58]);
-  x9[59] = vaddq_s32(x7[59], x8[58]);
-  x9[60] = vaddq_s32(x7[60], x8[61]);
-  x9[61] = vsubq_s32(x7[60], x8[61]);
-  x9[62] = vsubq_s32(x7[63], x8[62]);
-  x9[63] = vaddq_s32(x7[63], x8[62]);
+  butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+  butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+  butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+  butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+  butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+  butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+  butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+  butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+  butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
+  butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
+  butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
+  butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
+  butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
+  butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
+  butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
+  butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);
 
   // stage 10
   int32x4_t x10[64];
+  butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
+  butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
+  butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
+  butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
+  butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
+  butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
+  butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
+  butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
+  butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
+  butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
+  butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
+  butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
+  butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
+  butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
+  butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
+  butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);
 
-  btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
-                    v_cos_bit);
-  btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
-                    v_cos_bit);
-
-  startidx = 0 * outstride;
-  endidx = 63 * outstride;
-  // stage 11
-  output[startidx] = x6[0];
-  output[endidx] = x10[63];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[32];
-  output[endidx] = x9[31];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[16];
-  output[endidx] = x10[47];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[48];
-  output[endidx] = x8[15];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x8[8];
-  output[endidx] = x10[55];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[40];
-  output[endidx] = x9[23];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[24];
-  output[endidx] = x10[39];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[56];
-  output[endidx] = x7[7];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x7[4];
-  output[endidx] = x10[59];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[36];
-  output[endidx] = x9[27];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[20];
-  output[endidx] = x10[43];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[52];
-  output[endidx] = x8[11];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x8[12];
-  output[endidx] = x10[51];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[44];
-  output[endidx] = x9[19];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[28];
-  output[endidx] = x10[35];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[60];
-  output[endidx] = x6[3];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x6[2];
-  output[endidx] = x10[61];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[34];
-  output[endidx] = x9[29];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[18];
-  output[endidx] = x10[45];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[50];
-  output[endidx] = x8[13];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x8[10];
-  output[endidx] = x10[53];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[42];
-  output[endidx] = x9[21];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[26];
-  output[endidx] = x10[37];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[58];
-  output[endidx] = x7[5];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x7[6];
-  output[endidx] = x10[57];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[38];
-  output[endidx] = x9[25];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[22];
-  output[endidx] = x10[41];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[54];
-  output[endidx] = x8[9];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x8[14];
-  output[endidx] = x10[49];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[46];
-  output[endidx] = x9[17];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x9[30];
-  output[endidx] = x10[33];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[62];
-  output[endidx] = x6[1];
+  // stage 11, only store into the low 32 output indices.
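+  // Only the lowest 32 of the 64 outputs are produced: AV1 only codes the
+  // first 32 coefficients along a 64-length transform dimension, so the
+  // upper half is never needed.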
+  output[0] = x6[0];
+  output[1] = x10[32];
+  output[2] = x9[16];
+  output[3] = x10[48];
+  output[4] = x8[8];
+  output[5] = x10[40];
+  output[6] = x9[24];
+  output[7] = x10[56];
+  output[8] = x7[4];
+  output[9] = x10[36];
+  output[10] = x9[20];
+  output[11] = x10[52];
+  output[12] = x8[12];
+  output[13] = x10[44];
+  output[14] = x9[28];
+  output[15] = x10[60];
+  output[16] = x6[2];
+  output[17] = x10[34];
+  output[18] = x9[18];
+  output[19] = x10[50];
+  output[20] = x8[10];
+  output[21] = x10[42];
+  output[22] = x9[26];
+  output[23] = x10[58];
+  output[24] = x7[6];
+  output[25] = x10[38];
+  output[26] = x9[22];
+  output[27] = x10[54];
+  output[28] = x8[14];
+  output[29] = x10[46];
+  output[30] = x9[30];
+  output[31] = x10[62];
 }
 
-static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
-                                            int32_t *output, int stride,
-                                            TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   (void)tx_type;
   assert(tx_type == DCT_DCT);
-  const TX_SIZE tx_size = TX_64X64;
   int16x8_t buf0[64], buf1[512];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
-  const int width_div8 = (width >> 3);
-  const int height_div8 = (height >> 3);
+  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
 
-  for (int i = 0; i < width_div8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit(buf0, height, shift[1]);
-    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
-      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+  for (int i = 0; i < 8; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+    col_txfm(buf0, buf0, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 64);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
     }
   }
-  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+  for (int i = 0; i < 4; i++) {
     int32x4_t bufA[64];
     int32x4_t bufB[64];
-    int16x8_t *buf = buf1 + width * i;
-    for (int j = 0; j < width; ++j) {
+    int16x8_t *buf = buf1 + 64 * i;
+    for (int j = 0; j < 64; ++j) {
       bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
       bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
     }
-    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
-    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
-    av1_round_shift_array_32_neon(bufA, bufA, 32);
-    av1_round_shift_array_32_neon(bufB, bufB, 32);
-
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    fdct64_new_neon(bufA, bufA, 10);
+    fdct64_new_neon(bufB, bufB, 10);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }
-static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
-                                            int32_t *output, int stride,
-                                            TX_TYPE tx_type, int bd) {
-  (void)bd;
-  const TX_SIZE tx_size = TX_64X32;
-  int16x8_t buf0[64], buf1[256];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const int width_div8 = (width >> 3);
-  const int height_div8 = (height >> 3);
 
-  for (int i = 0; i < width_div8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit(buf0, height, shift[1]);
-    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
-      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[64], buf1[256];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+
+  for (int i = 0; i < 8; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 12);
+    shift_right_4_round_s16_x8(buf0, buf0, 32);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
     }
   }
   assert(tx_type == DCT_DCT);
-  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+  for (int i = 0; i < 4; i++) {
     int32x4_t bufA[64];
     int32x4_t bufB[64];
-    int16x8_t *buf = buf1 + width * i;
-    for (int j = 0; j < width; ++j) {
+    int16x8_t *buf = buf1 + 64 * i;
+    for (int j = 0; j < 64; ++j) {
       bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
       bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
     }
-    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
-    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
-    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
-    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
-
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    fdct64_new_neon(bufA, bufA, 11);
+    fdct64_new_neon(bufB, bufB, 11);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }
 
-static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
-                                            int32_t *output, int stride,
-                                            TX_TYPE tx_type, int bd) {
+static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   (void)tx_type;
   assert(tx_type == DCT_DCT);
-  const TX_SIZE tx_size = TX_32X64;
   int16x8_t buf0[64], buf1[256];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
-  const int width_div8 = (width >> 3);
-  const int height_div8 = (height >> 3);
+  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
 
-  for (int i = 0; i < width_div8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col, NULL);
-    round_shift_16bit(buf0, height, shift[1]);
-    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
-      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+  for (int i = 0; i < 4; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+    col_txfm(buf0, buf0, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 64);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
     }
   }
 
-  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+  for (int i = 0; i < 4; i++) {
     int32x4_t bufA[32];
     int32x4_t bufB[32];
-    int16x8_t *buf = buf1 + width * i;
-    for (int j = 0; j < width; ++j) {
+    int16x8_t *buf = buf1 + 32 * i;
+    for (int j = 0; j < 32; ++j) {
       bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
       bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
     }
-    av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
-    av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
-    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
-    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
-
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    fdct32_new_neon(bufA, bufA, 11);
+    fdct32_new_neon(bufB, bufB, 11);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }
 
 static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
-  av1_lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
-  av1_lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
-  av1_lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
-  av1_lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
-  av1_lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
-  av1_lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
-  av1_lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
-  av1_lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
-  av1_lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
-  av1_lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
-  av1_lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
-  av1_lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
-  av1_lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
-  av1_lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
-  av1_lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
-  av1_lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
-  av1_lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
-  av1_lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
-  av1_lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
+  lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
+  lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
+  lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
+  lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
+  lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
+  lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
+  lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
+  lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
+  lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
+  lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
+  lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
+  lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
+  lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
+  lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
+  lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
+  lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
+  lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
+  lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
+  lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
 };
 
 void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
index ee93608..582863a 100644
--- a/av1/encoder/arm/neon/encodetxb_neon.c
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -57,10 +57,7 @@
     } while (i < width);
   } else if (height == 8) {
     do {
-      const int32x4_t coeffA = vld1q_s32(cf);
-      const int32x4_t coeffB = vld1q_s32(cf + 4);
-      const int16x8_t coeffAB =
-          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
       const int16x8_t absAB = vqabsq_s16(coeffAB);
       const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
           vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
@@ -73,14 +70,8 @@
     do {
       int j = 0;
       do {
-        const int32x4_t coeffA = vld1q_s32(cf);
-        const int32x4_t coeffB = vld1q_s32(cf + 4);
-        const int32x4_t coeffC = vld1q_s32(cf + 8);
-        const int32x4_t coeffD = vld1q_s32(cf + 12);
-        const int16x8_t coeffAB =
-            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
-        const int16x8_t coeffCD =
-            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
+        const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+        const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
         const int16x8_t absAB = vqabsq_s16(coeffAB);
         const int16x8_t absCD = vqabsq_s16(coeffCD);
         const uint8x16_t absABCD = vreinterpretq_u8_s8(
@@ -282,7 +273,7 @@
   const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
 
   uint8x16_t pos_to_offset =
-      vld1q_u8((width == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);
+      (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
 
   uint8x16_t count;
   uint8x16_t level[5];
diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
index 15d375a..aa64a38 100644
--- a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
+++ b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -12,2112 +12,1586 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "av1/common/av1_txfm.h"
-#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_dsp/txfm_common.h"
 #include "aom_ports/mem.h"
-#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
 
-static INLINE void store_output_w4(int32_t *const out,
-                                   const int32x4_t *const in, const int stride,
-                                   const int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    vst1q_s32(out + i * stride, in[i]);
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+                                                        int32x4_t *out) {
+  // This is not quite the same as the other transposes defined in
+  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
+  // unused by the following row transform.
+  for (int j = 0; j < 8; ++j) {
+    for (int i = 0; i < 16; ++i) {
+      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
+    }
   }
 }
 
-static INLINE int32x4_t half_btf_neon(const int32_t *w0, const int32x4_t *n0,
-                                      const int32_t *w1, const int32x4_t *n1,
-                                      const int32x4_t v_bit) {
-  int32x4_t x;
-  x = vmulq_n_s32(*n0, *w0);
-  x = vmlaq_n_s32(x, *n1, *w1);
-  x = vrshlq_s32(x, v_bit);
-  return x;
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0112_neon
+//                ^ Weights are applied as indices 0, 1, 1, 2
+//                  (see more detail below)
+//
+// Weight indices are treated as an index into the 4-tuple of the loaded
+// weight pair and its negation: w=(w0, w1, -w0, -w1).
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+//         in0   in1
+//      /------------
+// out0 |  w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
+// out1 |  w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0112 from the earlier example, we end up with:
+//
+//          in0       in1
+//      /------------------
+// out0 | (lane 0) (lane 1)   ==>  out0 = in0 *  w0 + in1 *  w1
+// out1 | (lane 1) (lane 2)   ==>  out1 = in0 *  w1 + in1 * -w0
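+//
+// Applying the same scheme to the other index orderings defined below (with
+// w0 = cospi[2 * widx0] and w1 = cospi[2 * widx0 + 1], as loaded by
+// vld1_s32) gives:
+//
+//   butterfly_2312:  out0 = -in0*w0 - in1*w1   out1 =  in0*w1 - in1*w0
+//   butterfly_0332:  out0 =  in0*w0 - in1*w1   out1 = -in0*w1 - in1*w0
+//   butterfly_0130:  out0 =  in0*w0 + in1*w1   out1 = -in0*w1 + in1*w0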
+
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
+  do {                                                                  \
+    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
+    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
+    *out = vrshlq_s32(x, v_bit);                                        \
+  } while (false)
+
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
 }
 
-static INLINE int32x4_t half_btf_neon_m(const int32_t *w0, const int32x4_t *n0,
-                                        const int32_t *w1, const int32x4_t *n1,
-                                        const int32x4_t v_bit) {
-  int32x4_t x;
-  x = vmulq_n_s32(*n0, *w0);
-  x = vmlsq_n_s32(x, *n1, *w1);
-  x = vrshlq_s32(x, v_bit);
-  return x;
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
 }
 
-#if AOM_ARCH_AARCH64
-#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
-  do {                                                        \
-    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
-    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
-    y0 = vreinterpretq_s32_s64(                               \
-        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
-                   vreinterpretq_s64_s32(swap_high.val[0]))); \
-    y1 = vreinterpretq_s32_s64(                               \
-        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
-                   vreinterpretq_s64_s32(swap_high.val[1]))); \
-    y2 = vreinterpretq_s32_s64(                               \
-        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
-                   vreinterpretq_s64_s32(swap_high.val[0]))); \
-    y3 = vreinterpretq_s32_s64(                               \
-        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
-                   vreinterpretq_s64_s32(swap_high.val[1]))); \
-  } while (0)
-#else
-#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
-  do {                                                                   \
-    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
-    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
-    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
-                   swap_high.val[0], 2);                                 \
-    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
-                   swap_high.val[1], 2);                                 \
-    y2 = vextq_s32(swap_low.val[0],                                      \
-                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
-    y3 = vextq_s32(swap_low.val[1],                                      \
-                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
-  } while (0)
-#endif  // AOM_ARCH_AARCH64
-
-static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
-  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
 }
 
-static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
-  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
-  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
-  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
-  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
-                out[15]);
+static AOM_FORCE_INLINE void butterfly_0130_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
 }
 
-static INLINE void transpose_16x16(const int32x4_t *in, int32x4_t *out) {
-  // Upper left 8x8
-  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
-  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
-                out[28]);
-  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
-                out[13]);
-  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
-                out[29]);
-
-  // Upper right 8x8
-  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
-                out[44]);
-  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
-                out[60]);
-  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
-                out[45]);
-  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
-                out[61]);
-
-  // Lower left 8x8
-  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
-                out[14]);
-  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
-                out[30]);
-  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
-                out[15]);
-  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
-                out[31]);
-  // Lower right 8x8
-  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
-                out[46]);
-  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
-                out[62]);
-  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
-                out[47]);
-  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
-                out[63]);
+static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
+    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+  butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
 }
 
-static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
-                                                      int32x4_t *output,
-                                                      const int size,
-                                                      const int bit,
-                                                      const int val) {
-  const int32x4_t sqrt2 = vdupq_n_s32(val);
-  const int32x4_t v_bit = vdupq_n_s32(-bit);
-  int i;
-  for (i = 0; i < size; i++) {
-    const int32x4_t r0 = vrshlq_s32(input[i], v_bit);
-    const int32x4_t r1 = vmulq_s32(sqrt2, r0);
+static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
+    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+  butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
+                                                       int32x4_t *output,
+                                                       const int size) {
+  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+  int i = 0;
+  do {
+    const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
     output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
-  }
+  } while (++i < size);
 }
 
-#define btf_32_neon_type0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
-  do {                                                             \
-    out0 = vmulq_n_s32(in0, w0);                                   \
-    out0 = vmlaq_n_s32(out0, in1, w1);                             \
-    out0 = vrshlq_s32(out0, v_cos_bit);                            \
-    out1 = vmulq_n_s32(in0, w1);                                   \
-    out1 = vmlsq_n_s32(out1, in1, w0);                             \
-    out1 = vrshlq_s32(out1, v_cos_bit);                            \
-  } while (0)
+static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
+    const int32x4_t *input, int32x4_t *output, const int size) {
+  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+  int i = 0;
+  do {
+    const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
+    const int32x4_t r1 = vmulq_s32(r0, sqrt2);
+    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+  } while (++i < size);
+}
 
-#define btf_32_neon_type1(w0, w1, in0, in1, out0, out1, bit) \
-  do {                                                       \
-    btf_32_neon_type0(w1, w0, in1, in0, out0, out1, bit);    \
-  } while (0)
-
-static INLINE void load_buffer_4x4(const int16_t *input, int32x4_t *in,
-                                   int stride, int flipud, int fliplr,
-                                   const int32x4_t *v_shift) {
-  int16x4_t v0, v1, v2, v3;
-
-  if (!flipud) {
-    v0 = vld1_s16(input + 0 * stride);
-    v1 = vld1_s16(input + 1 * stride);
-    v2 = vld1_s16(input + 2 * stride);
-    v3 = vld1_s16(input + 3 * stride);
-  } else {
-    v0 = vld1_s16(input + 3 * stride);
-    v1 = vld1_s16(input + 2 * stride);
-    v2 = vld1_s16(input + 1 * stride);
-    v3 = vld1_s16(input + 0 * stride);
+#define LOAD_BUFFER_4XH(h)                                           \
+  static AOM_FORCE_INLINE void load_buffer_4x##h(                    \
+      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+    if (fliplr) {                                                    \
+      for (int i = 0; i < (h); ++i) {                                \
+        int16x4_t a = vld1_s16(input + i * stride);                  \
+        a = vrev64_s16(a);                                           \
+        in[i] = vshll_n_s16(a, 2);                                   \
+      }                                                              \
+    } else {                                                         \
+      for (int i = 0; i < (h); ++i) {                                \
+        int16x4_t a = vld1_s16(input + i * stride);                  \
+        in[i] = vshll_n_s16(a, 2);                                   \
+      }                                                              \
+    }                                                                \
   }
 
-  if (fliplr) {
-    v0 = vrev64_s16(v0);
-    v1 = vrev64_s16(v1);
-    v2 = vrev64_s16(v2);
-    v3 = vrev64_s16(v3);
+// AArch32 does not permit the shift argument to vshll_n_s16 to be zero, and
+// the immediate is checked even on the branch that is never taken when
+// `shift == 0`, so a dummy non-zero shift is substituted in that case.
+#define shift_left_long_s16(a, shift) \
+  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
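+// For example, shift_left_long_s16(a, 0) reduces to vmovl_s16(a), while
+// shift_left_long_s16(a, 2) widens each lane and shifts it left by 2.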
+
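+// Each LOAD_BUFFER_WXH helper loads a w x h block of int16 input, widens it
+// to int32 with the given left shift applied, and stores it as w / 4 groups
+// of four columns, each group being h consecutive int32x4_t vectors.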
+#define LOAD_BUFFER_WXH(w, h, shift)                                 \
+  static AOM_FORCE_INLINE void load_buffer_##w##x##h(                \
+      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+    assert(w >= 8);                                                  \
+    if (fliplr) {                                                    \
+      for (int i = 0; i < (h); ++i) {                                \
+        for (int j = 0; j < (w) / 8; ++j) {                          \
+          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
+          a = vrev64q_s16(a);                                        \
+          int j2 = (w) / 8 - j - 1;                                  \
+          in[i + (h) * (2 * j2 + 0)] =                               \
+              shift_left_long_s16(vget_high_s16(a), (shift));        \
+          in[i + (h) * (2 * j2 + 1)] =                               \
+              shift_left_long_s16(vget_low_s16(a), (shift));         \
+        }                                                            \
+      }                                                              \
+    } else {                                                         \
+      for (int i = 0; i < (h); ++i) {                                \
+        for (int j = 0; j < (w) / 8; ++j) {                          \
+          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
+          in[i + (h) * (2 * j + 0)] =                                \
+              shift_left_long_s16(vget_low_s16(a), (shift));         \
+          in[i + (h) * (2 * j + 1)] =                                \
+              shift_left_long_s16(vget_high_s16(a), (shift));        \
+        }                                                            \
+      }                                                              \
+    }                                                                \
   }
-  in[0] = vshlq_s32(vmovl_s16(v0), *v_shift);
-  in[1] = vshlq_s32(vmovl_s16(v1), *v_shift);
-  in[2] = vshlq_s32(vmovl_s16(v2), *v_shift);
-  in[3] = vshlq_s32(vmovl_s16(v3), *v_shift);
-}
 
-static void fdct4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
-                         const int num_col) {
-  const int32_t *cospi = cospi_arr(bit);
-  const int32x4_t cospi32 = vdupq_n_s32(cospi[32]);
-  const int32x4_t cospi48 = vdupq_n_s32(cospi[48]);
-  const int32x4_t cospi16 = vdupq_n_s32(cospi[16]);
-  int32x4_t s0, s1, s2, s3;
-  int32x4_t u0, u1, u2, u3;
-  int32x4_t v0, v2;
+LOAD_BUFFER_4XH(4)
+LOAD_BUFFER_4XH(8)
+LOAD_BUFFER_4XH(16)
+LOAD_BUFFER_4XH(32)
+LOAD_BUFFER_WXH(8, 8, 2)
+LOAD_BUFFER_WXH(16, 16, 2)
+LOAD_BUFFER_WXH(32, 64, 0)
+LOAD_BUFFER_WXH(64, 32, 2)
+LOAD_BUFFER_WXH(64, 64, 0)
 
-  int endidx = 3 * num_col;
-  s0 = vaddq_s32(in[0], in[endidx]);
-  s3 = vsubq_s32(in[0], in[endidx]);
-  endidx -= num_col;
-  s1 = vaddq_s32(in[num_col], in[endidx]);
-  s2 = vsubq_s32(in[num_col], in[endidx]);
+#if !CONFIG_REALTIME_ONLY
+LOAD_BUFFER_WXH(16, 64, 0)
+LOAD_BUFFER_WXH(64, 16, 2)
+#endif  // !CONFIG_REALTIME_ONLY
 
-  u0 = vmulq_s32(s0, cospi32);
-  u1 = vmulq_s32(s1, cospi32);
-  u2 = vaddq_s32(u0, u1);
-  v0 = vsubq_s32(u0, u1);
-  const int32x4_t v_bit = vdupq_n_s32(-bit);
-  u0 = vrshlq_s32(u2, v_bit);
-  u2 = vrshlq_s32(v0, v_bit);
+#define STORE_BUFFER_WXH(w, h)                                \
+  static AOM_FORCE_INLINE void store_buffer_##w##x##h(        \
+      const int32x4_t *in, int32_t *out, int stride) {        \
+    for (int i = 0; i < (w); ++i) {                           \
+      for (int j = 0; j < (h) / 4; ++j) {                     \
+        vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
+      }                                                       \
+    }                                                         \
+  }
 
-  v0 = vmulq_s32(s2, cospi48);
-  v2 = vmlaq_s32(v0, s3, cospi16);
+STORE_BUFFER_WXH(4, 4)
+STORE_BUFFER_WXH(8, 4)
+STORE_BUFFER_WXH(8, 8)
+STORE_BUFFER_WXH(16, 4)
+STORE_BUFFER_WXH(16, 16)
+STORE_BUFFER_WXH(32, 4)
+STORE_BUFFER_WXH(32, 32)
+STORE_BUFFER_WXH(64, 32)
 
-  u1 = vrshlq_s32(v2, v_bit);
+#if !CONFIG_REALTIME_ONLY
+STORE_BUFFER_WXH(16, 32)
+STORE_BUFFER_WXH(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
 
-  v0 = vmulq_s32(s3, cospi48);
-  v2 = vmlsq_s32(v0, s2, cospi16);
+static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
+                                                  int32x4_t *out, int bit) {
+  const int32_t *const cospi = cospi_arr_s32(bit);
+  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
+  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
 
-  u3 = vrshlq_s32(v2, v_bit);
+  const int32x4_t a0 = vaddq_s32(in[0], in[3]);
+  const int32x4_t a1 = vsubq_s32(in[0], in[3]);
+  const int32x4_t a2 = vaddq_s32(in[1], in[2]);
+  const int32x4_t a3 = vsubq_s32(in[1], in[2]);
 
-  out[0] = u0;
-  out[1] = u1;
-  out[2] = u2;
-  out[3] = u3;
-}
+  const int32x4_t b0 = vmulq_s32(a0, cospi32);
+  const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
+  const int32x4_t b2 = vmulq_s32(a2, cospi32);
+  const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);
 
-static INLINE void write_buffer_4x4(int32x4_t *res, int32_t *output) {
-  vst1q_s32((output + 0 * 4), res[0]);
-  vst1q_s32((output + 1 * 4), res[1]);
-  vst1q_s32((output + 2 * 4), res[2]);
-  vst1q_s32((output + 3 * 4), res[3]);
-}
-
-static void fadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
-                          const int num_col) {
-  const int32_t *sinpi = sinpi_arr(bit);
-  const int32x4_t sinpi4x = vld1q_s32(&sinpi[1]);
-
-  const int32x4_t sinpi1 = vdupq_lane_s32(vget_low_s32(sinpi4x), 0);
-  const int32x4_t sinpi2 = vdupq_lane_s32(vget_low_s32(sinpi4x), 1);
-  const int32x4_t sinpi3 = vdupq_lane_s32(vget_high_s32(sinpi4x), 0);
-  const int32x4_t sinpi4 = vdupq_lane_s32(vget_high_s32(sinpi4x), 1);
-  int32x4_t t;
-  int32x4_t s0, s1, s2, s3, s7;
-  int32x4_t x0, x1, x2, x3;
-
-  int idx = 0 * num_col;
-  s0 = vmulq_s32(in[idx], sinpi1);
-  s1 = vmulq_s32(in[idx], sinpi4);
-  t = vaddq_s32(in[idx], in[idx + num_col]);
-  idx += 2 * num_col;
-  x3 = vmulq_s32(in[idx], sinpi3);
-  idx += num_col;
-  s7 = vsubq_s32(t, in[idx]);
-
-  t = vmlaq_s32(s0, in[idx - 2 * num_col], sinpi2);
-  x0 = vmlaq_s32(t, in[idx], sinpi4);
-  x1 = vmulq_s32(s7, sinpi3);
-  t = vmlsq_s32(s1, in[idx - 2 * num_col], sinpi1);
-  x2 = vmlaq_s32(t, in[idx], sinpi2);
-
-  s0 = vaddq_s32(x0, x3);
-  s1 = x1;
-  s2 = vsubq_s32(x2, x3);
-  t = vsubq_s32(x2, x0);
-  s3 = vaddq_s32(t, x3);
+  const int32x4_t c0 = vaddq_s32(b0, b2);
+  const int32x4_t c1 = vsubq_s32(b0, b2);
+  const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
+  const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);
 
   const int32x4_t v_bit = vdupq_n_s32(-bit);
-  out[0] = vrshlq_s32(s0, v_bit);
-  out[1] = vrshlq_s32(s1, v_bit);
-  out[2] = vrshlq_s32(s2, v_bit);
-  out[3] = vrshlq_s32(s3, v_bit);
+  const int32x4_t d0 = vrshlq_s32(c0, v_bit);
+  const int32x4_t d1 = vrshlq_s32(c1, v_bit);
+  const int32x4_t d2 = vrshlq_s32(c2, v_bit);
+  const int32x4_t d3 = vrshlq_s32(c3, v_bit);
+
+  out[0] = d0;
+  out[1] = d2;
+  out[2] = d1;
+  out[3] = d3;
 }
-static void idtx4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+
+static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
+                                                   int32x4_t *out, int bit) {
+  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
+
+  const int32x4_t a0 = vaddq_s32(in[0], in[1]);
+  const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
+  const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
+  const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);
+
+  const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
+  const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
+  const int32x4_t b2 = vsubq_s32(a0, in[3]);
+
+  const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
+  const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
+  const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);
+
+  const int32x4_t d0 = vaddq_s32(c0, a3);
+  const int32x4_t d1 = vsubq_s32(c1, a3);
+  const int32x4_t d2 = vsubq_s32(c1, c0);
+
+  const int32x4_t e0 = vaddq_s32(d2, a3);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  out[0] = vrshlq_s32(d0, v_bit);
+  out[1] = vrshlq_s32(c2, v_bit);
+  out[2] = vrshlq_s32(d1, v_bit);
+  out[3] = vrshlq_s32(e0, v_bit);
+}
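+
+// Illustrative scalar sketch of the 4-point ADST computed above (an
+// assumption based on the AV1 reference transform, not code in this file),
+// writing s1..s4 for sinpi[1]..sinpi[4]:
+//   t0 = s1 * x0 + s2 * x1 + s4 * x3;
+//   t1 = s3 * (x0 + x1 - x3);
+//   t2 = s4 * x0 - s1 * x1 + s2 * x3;
+//   t3 = s3 * x2;
+//   out[0] = round_shift(t0 + t3, bit);
+//   out[1] = round_shift(t1, bit);
+//   out[2] = round_shift(t2 - t3, bit);
+//   out[3] = round_shift(t2 - t0 + t3, bit);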
+
+static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
+                                                       int32x4_t *out,
+                                                       int bit) {
   (void)bit;
   int32x4_t fact = vdupq_n_s32(NewSqrt2);
-  int32x4_t a_low;
 
-  int i;
-  for (i = 0; i < 4; i++) {
-    a_low = vmulq_s32(in[i * col_num], fact);
+  for (int i = 0; i < 4; i++) {
+    const int32x4_t a_low = vmulq_s32(in[i], fact);
     out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
   }
 }
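+
+// The identity transform only rescales: NewSqrt2 is assumed to be sqrt(2) in
+// fixed point with NewSqrt2Bits fractional bits, so each output is
+// round(x * sqrt(2)). For example, assuming NewSqrt2 = 5793 and
+// NewSqrt2Bits = 12, x = 100 gives (100 * 5793 + 2048) >> 12 = 141, which is
+// approximately 100 * sqrt(2).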
+
 void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                              int input_stride, TX_TYPE tx_type, int bd) {
-  int32x4_t in[4];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
-  const int txw_idx = get_txw_idx(TX_4X4);
-  const int txh_idx = get_txh_idx(TX_4X4);
-  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  (void)bd;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);
+
+  // Workspace for column/row-wise transforms.
+  int32x4_t buf[4];
+
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case ADST_DCT:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case DCT_ADST:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case ADST_ADST:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case FLIPADST_DCT:
-      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case DCT_FLIPADST:
-      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case FLIPADST_FLIPADST:
-      load_buffer_4x4(input, in, input_stride, 1, 1, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case ADST_FLIPADST:
-      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case FLIPADST_ADST:
-      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case IDTX:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case V_DCT:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case H_DCT:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case V_ADST:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case H_ADST:
-      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case V_FLIPADST:
-      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     case H_FLIPADST:
-      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
-      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      transpose_4x4(in, in);
-      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
-      write_buffer_4x4(in, coeff);
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
       break;
     default: assert(0);
   }
-  (void)bd;
 }
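+
+// Illustrative usage sketch (not part of the library; the calling convention
+// is taken from the signature above): forward-transform one 4x4 block of
+// high-bitdepth residuals into 16 int32_t coefficients.
+//   int16_t residual[4 * 4];  // filled by the caller
+//   int32_t coeff[4 * 4];
+//   av1_fwd_txfm2d_4x4_neon(residual, coeff, /*input_stride=*/4, DCT_DCT,
+//                           /*bd=*/10);
+// Every tx_type follows the same column transform -> transpose -> row
+// transform -> store pipeline; only the per-dimension kernels differ.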
 
-static INLINE void load_buffer_8x8(const int16_t *input, int32x4_t *in,
-                                   int stride, int flipud, int fliplr,
-                                   const int shift) {
-  if (!flipud) {
-    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
-    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
-    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
-    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
-    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
-    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
-    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
-    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
-  } else {
-    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
-    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
-    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
-    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
-    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
-    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
-    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
-    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
+// Butterfly pre-processing:
+// e.g. n=4:
+//   out[0] = in[0] + in[3]
+//   out[1] = in[1] + in[2]
+//   out[2] = in[1] - in[2]
+//   out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
+                                               int32x4_t *output, int n) {
+  for (int i = 0; i < n / 2; ++i) {
+    output[i] = vaddq_s32(input[i], input[n - i - 1]);
   }
-
-  if (fliplr) {
-    in[0] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[0])));
-    in[0] = vextq_s32(in[0], in[0], 2);
-    in[1] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[1])));
-    in[1] = vextq_s32(in[1], in[1], 2);
-    in[2] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[2])));
-    in[2] = vextq_s32(in[2], in[2], 2);
-    in[3] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[3])));
-    in[3] = vextq_s32(in[3], in[3], 2);
-    in[4] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[4])));
-    in[4] = vextq_s32(in[4], in[4], 2);
-    in[5] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[5])));
-    in[5] = vextq_s32(in[5], in[5], 2);
-    in[6] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[6])));
-    in[6] = vextq_s32(in[6], in[6], 2);
-    in[7] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[7])));
-    in[7] = vextq_s32(in[7], in[7], 2);
+  for (int i = 0; i < n / 2; ++i) {
+    output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
   }
-
-  int16x4_t u = vget_high_s16(vreinterpretq_s16_s32(in[4]));
-  in[8] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[4])));
-  in[9] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[5]));
-  in[10] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[5])));
-  in[11] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[6]));
-  in[12] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[6])));
-  in[13] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[7]));
-  in[14] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[7])));
-  in[15] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[3]));
-  in[6] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[3])));
-  in[7] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[2]));
-  in[4] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[2])));
-  in[5] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[1]));
-  in[2] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[1])));
-  in[3] = vmovl_s16(u);
-
-  u = vget_high_s16(vreinterpretq_s16_s32(in[0]));
-  in[0] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[0])));
-  in[1] = vmovl_s16(u);
-
-  const int32x4_t v_shift = vdupq_n_s32(shift);
-
-  in[0] = vshlq_s32(in[0], v_shift);
-  in[1] = vshlq_s32(in[1], v_shift);
-  in[2] = vshlq_s32(in[2], v_shift);
-  in[3] = vshlq_s32(in[3], v_shift);
-  in[4] = vshlq_s32(in[4], v_shift);
-  in[5] = vshlq_s32(in[5], v_shift);
-  in[6] = vshlq_s32(in[6], v_shift);
-  in[7] = vshlq_s32(in[7], v_shift);
-
-  in[8] = vshlq_s32(in[8], v_shift);
-  in[9] = vshlq_s32(in[9], v_shift);
-  in[10] = vshlq_s32(in[10], v_shift);
-  in[11] = vshlq_s32(in[11], v_shift);
-  in[12] = vshlq_s32(in[12], v_shift);
-  in[13] = vshlq_s32(in[13], v_shift);
-  in[14] = vshlq_s32(in[14], v_shift);
-  in[15] = vshlq_s32(in[15], v_shift);
 }
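+
+// For reference, the n = 8 case used by the 8-point DCT below expands to:
+//   out[0] = in[0] + in[7]    out[4] = in[3] - in[4]
+//   out[1] = in[1] + in[6]    out[5] = in[2] - in[5]
+//   out[2] = in[2] + in[5]    out[6] = in[1] - in[6]
+//   out[3] = in[3] + in[4]    out[7] = in[0] - in[7]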
 
-static INLINE void col_txfm_8x8_rounding(int32x4_t *in,
-                                         const int32x4_t *v_shift) {
-  in[0] = vrshlq_s32(in[0], *v_shift);
-  in[1] = vrshlq_s32(in[1], *v_shift);
-  in[2] = vrshlq_s32(in[2], *v_shift);
-  in[3] = vrshlq_s32(in[3], *v_shift);
-  in[4] = vrshlq_s32(in[4], *v_shift);
-  in[5] = vrshlq_s32(in[5], *v_shift);
-  in[6] = vrshlq_s32(in[6], *v_shift);
-  in[7] = vrshlq_s32(in[7], *v_shift);
-  in[8] = vrshlq_s32(in[8], *v_shift);
-  in[9] = vrshlq_s32(in[9], *v_shift);
-  in[10] = vrshlq_s32(in[10], *v_shift);
-  in[11] = vrshlq_s32(in[11], *v_shift);
-  in[12] = vrshlq_s32(in[12], *v_shift);
-  in[13] = vrshlq_s32(in[13], *v_shift);
-  in[14] = vrshlq_s32(in[14], *v_shift);
-  in[15] = vrshlq_s32(in[15], *v_shift);
+// Butterfly post-processing:
+// e.g. n=8:
+//   out[0] = in0[0] + in1[3];
+//   out[1] = in0[1] + in1[2];
+//   out[2] = in0[1] - in1[2];
+//   out[3] = in0[0] - in1[3];
+//   out[4] = in0[7] - in1[4];
+//   out[5] = in0[6] - in1[5];
+//   out[6] = in0[6] + in1[5];
+//   out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
+                                                const int32x4_t *in1,
+                                                int32x4_t *output, int n) {
+  for (int i = 0; i < n / 4; ++i) {
+    output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+  }
+  for (int i = 0; i < n / 4; ++i) {
+    output[(3 * n) / 4 + i] =
+        vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+  }
 }
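+
+// For reference, the n = 4 case used by the 8-point DCT below expands to:
+//   out[0] = in0[0] + in1[1]
+//   out[1] = in0[0] - in1[1]
+//   out[2] = in0[3] - in1[2]
+//   out[3] = in0[3] + in1[2]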
 
-static INLINE void col_txfm_4x8_rounding(int32x4_t *in,
-                                         const int32x4_t *v_shift) {
-  in[0] = vrshlq_s32(in[0], *v_shift);
-  in[1] = vrshlq_s32(in[1], *v_shift);
-  in[2] = vrshlq_s32(in[2], *v_shift);
-  in[3] = vrshlq_s32(in[3], *v_shift);
-  in[4] = vrshlq_s32(in[4], *v_shift);
-  in[5] = vrshlq_s32(in[5], *v_shift);
-  in[6] = vrshlq_s32(in[6], *v_shift);
-  in[7] = vrshlq_s32(in[7], *v_shift);
-}
-
-static INLINE void write_buffer_8x8(const int32x4_t *res, int32_t *output) {
-  vst1q_s32(output + 0 * 4, res[0]);
-  vst1q_s32(output + 1 * 4, res[1]);
-  vst1q_s32(output + 2 * 4, res[2]);
-  vst1q_s32(output + 3 * 4, res[3]);
-
-  vst1q_s32(output + 4 * 4, res[4]);
-  vst1q_s32(output + 5 * 4, res[5]);
-  vst1q_s32(output + 6 * 4, res[6]);
-  vst1q_s32(output + 7 * 4, res[7]);
-
-  vst1q_s32(output + 8 * 4, res[8]);
-  vst1q_s32(output + 9 * 4, res[9]);
-  vst1q_s32(output + 10 * 4, res[10]);
-  vst1q_s32(output + 11 * 4, res[11]);
-
-  vst1q_s32(output + 12 * 4, res[12]);
-  vst1q_s32(output + 13 * 4, res[13]);
-  vst1q_s32(output + 14 * 4, res[14]);
-  vst1q_s32(output + 15 * 4, res[15]);
-}
-
-static INLINE void write_buffer_16x8(const int32x4_t *res, int32_t *output,
-                                     const int stride) {
-  vst1q_s32(output, res[0]);
-  vst1q_s32(output + 4, res[1]);
-  vst1q_s32(output + stride, res[2]);
-  vst1q_s32(output + stride + 4, res[3]);
-
-  vst1q_s32(output + (stride * 2), res[4]);
-  vst1q_s32(output + (stride * 2) + 4, res[5]);
-  vst1q_s32(output + (stride * 3), res[6]);
-  vst1q_s32(output + (stride * 3) + 4, res[7]);
-
-  vst1q_s32(output + (stride * 4), res[8]);
-  vst1q_s32(output + (stride * 4) + 4, res[9]);
-  vst1q_s32(output + (stride * 5), res[10]);
-  vst1q_s32(output + (stride * 5) + 4, res[11]);
-
-  vst1q_s32(output + (stride * 6), res[12]);
-  vst1q_s32(output + (stride * 6) + 4, res[13]);
-  vst1q_s32(output + (stride * 7), res[14]);
-  vst1q_s32(output + (stride * 7) + 4, res[15]);
-}
-
-static void fdct4x8_neon(int32x4_t *in, int32x4_t *out, int bit,
-                         const int col_num) {
-  const int32_t *cospi = cospi_arr(bit);
+static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
+                                                  int32x4_t *out, int bit) {
+  const int32_t *const cospi = cospi_arr_s32(bit);
   const int32x4_t v_bit = vdupq_n_s32(-bit);
-  int32x4_t u[8], v[8];
 
-  int startidx = 0 * col_num;
-  int endidx = 7 * col_num;
-  // stage 0-1
-  u[0] = vaddq_s32(in[startidx], in[endidx]);
-  v[7] = vsubq_s32(in[startidx], in[endidx]);
-  startidx += col_num;
-  endidx -= col_num;
-  u[1] = vaddq_s32(in[startidx], in[endidx]);
-  u[6] = vsubq_s32(in[startidx], in[endidx]);
-  startidx += col_num;
-  endidx -= col_num;
-  u[2] = vaddq_s32(in[startidx], in[endidx]);
-  u[5] = vsubq_s32(in[startidx], in[endidx]);
-  startidx += col_num;
-  endidx -= col_num;
-  u[3] = vaddq_s32(in[startidx], in[endidx]);
-  v[4] = vsubq_s32(in[startidx], in[endidx]);
+  // stage 1
+  int32x4_t a[8];
+  butterfly_dct_pre(in, a, 8);
 
   // stage 2
-  v[0] = vaddq_s32(u[0], u[3]);
-  v[3] = vsubq_s32(u[0], u[3]);
-  v[1] = vaddq_s32(u[1], u[2]);
-  v[2] = vsubq_s32(u[1], u[2]);
-
-  v[5] = vmulq_n_s32(u[6], cospi[32]);
-  v[5] = vmlsq_n_s32(v[5], u[5], cospi[32]);
-  v[5] = vrshlq_s32(v[5], v_bit);
-
-  u[0] = vmulq_n_s32(u[5], cospi[32]);
-  v[6] = vmlaq_n_s32(u[0], u[6], cospi[32]);
-  v[6] = vrshlq_s32(v[6], v_bit);
+  int32x4_t b[8];
+  butterfly_dct_pre(a, b, 4);
+  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);
 
   // stage 3
-  // type 0
-  v[0] = vmulq_n_s32(v[0], cospi[32]);
-  v[1] = vmulq_n_s32(v[1], cospi[32]);
-  u[0] = vaddq_s32(v[0], v[1]);
-  u[0] = vrshlq_s32(u[0], v_bit);
-
-  u[1] = vsubq_s32(v[0], v[1]);
-  u[1] = vrshlq_s32(u[1], v_bit);
-
-  // type 1
-  v[0] = vmulq_n_s32(v[2], cospi[48]);
-  u[2] = vmlaq_n_s32(v[0], v[3], cospi[16]);
-  u[2] = vrshlq_s32(u[2], v_bit);
-
-  v[1] = vmulq_n_s32(v[3], cospi[48]);
-  u[3] = vmlsq_n_s32(v[1], v[2], cospi[16]);
-  u[3] = vrshlq_s32(u[3], v_bit);
-
-  u[4] = vaddq_s32(v[4], v[5]);
-  u[5] = vsubq_s32(v[4], v[5]);
-  u[6] = vsubq_s32(v[7], v[6]);
-  u[7] = vaddq_s32(v[7], v[6]);
+  int32x4_t c[8];
+  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
+  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
+  butterfly_dct_post(a + 4, b + 4, c + 4, 4);
 
   // stage 4-5
-  v[0] = vmulq_n_s32(u[4], cospi[56]);
-  v[0] = vmlaq_n_s32(v[0], u[7], cospi[8]);
-  out[1 * col_num] = vrshlq_s32(v[0], v_bit);
+  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
+  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);
 
-  v[1] = vmulq_n_s32(u[7], cospi[56]);
-  v[0] = vmlsq_n_s32(v[1], u[4], cospi[8]);
-  out[7 * col_num] = vrshlq_s32(v[0], v_bit);
-
-  v[0] = vmulq_n_s32(u[5], cospi[24]);
-  v[0] = vmlaq_n_s32(v[0], u[6], cospi[40]);
-  out[5 * col_num] = vrshlq_s32(v[0], v_bit);
-
-  v[1] = vmulq_n_s32(u[6], cospi[24]);
-  v[0] = vmlsq_n_s32(v[1], u[5], cospi[40]);
-  out[3 * col_num] = vrshlq_s32(v[0], v_bit);
-
-  out[0 * col_num] = u[0];
-  out[4 * col_num] = u[1];
-  out[2 * col_num] = u[2];
-  out[6 * col_num] = u[3];
+  out[0] = c[0];
+  out[2] = c[2];
+  out[4] = c[1];
+  out[6] = c[3];
 }
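+
+// Structure of the 8-point DCT above: the even-indexed outputs out[0],
+// out[2], out[4] and out[6] form a 4-point DCT of the folded sums
+// a[i] = in[i] + in[7 - i], while the odd-indexed outputs are built from the
+// differences a[4..7] via a cospi[32] rotation, a 4-sample post butterfly and
+// final cospi[8]/cospi[24] rotations. butterfly_0112_neon and
+// butterfly_0130_neon are defined earlier in this file and are assumed here
+// to implement the usual two-output rotations, rounded by v_bit.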
 
-static void fdct8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
-                         const int col_num) {
-  fdct4x8_neon(in, out, bit, col_num);
-  fdct4x8_neon(in + 1, out + 1, bit, col_num);
-}
-
-static void fadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
-                          const int col_num) {
-  const int32_t *cospi = cospi_arr(bit);
-
+static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
+                                                   int32x4_t *out, int bit) {
+  const int32_t *const cospi = cospi_arr_s32(bit);
   const int32x4_t v_bit = vdupq_n_s32(-bit);
+
   int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
   int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
-  int32x4_t x, y;
-  int col;
 
-  for (col = 0; col < col_num; ++col) {
-    // stage 0-1
-    u0 = in[col_num * 0 + col];
-    u1 = vnegq_s32(in[col_num * 7 + col]);
-    u2 = vnegq_s32(in[col_num * 3 + col]);
-    u3 = in[col_num * 4 + col];
-    u4 = vnegq_s32(in[col_num * 1 + col]);
-    u5 = in[col_num * 6 + col];
-    u6 = in[col_num * 2 + col];
-    u7 = vnegq_s32(in[col_num * 5 + col]);
-
-    // stage 2
-    v0 = u0;
-    v1 = u1;
-
-    x = vmulq_n_s32(u2, cospi[32]);
-    y = vmulq_n_s32(u3, cospi[32]);
-    v2 = vaddq_s32(x, y);
-    v2 = vrshlq_s32(v2, v_bit);
-
-    v3 = vsubq_s32(x, y);
-    v3 = vrshlq_s32(v3, v_bit);
-
-    v4 = u4;
-    v5 = u5;
-
-    x = vmulq_n_s32(u6, cospi[32]);
-    y = vmulq_n_s32(u7, cospi[32]);
-    v6 = vaddq_s32(x, y);
-    v6 = vrshlq_s32(v6, v_bit);
-
-    v7 = vsubq_s32(x, y);
-    v7 = vrshlq_s32(v7, v_bit);
-
-    // stage 3
-    u0 = vaddq_s32(v0, v2);
-    u1 = vaddq_s32(v1, v3);
-    u2 = vsubq_s32(v0, v2);
-    u3 = vsubq_s32(v1, v3);
-    u4 = vaddq_s32(v4, v6);
-    u5 = vaddq_s32(v5, v7);
-    u6 = vsubq_s32(v4, v6);
-    u7 = vsubq_s32(v5, v7);
-
-    // stage 4
-    v0 = u0;
-    v1 = u1;
-    v2 = u2;
-    v3 = u3;
-
-    v4 = vmulq_n_s32(u4, cospi[16]);
-    v4 = vmlaq_n_s32(v4, u5, cospi[48]);
-    v4 = vrshlq_s32(v4, v_bit);
-
-    v5 = vmulq_n_s32(u4, cospi[48]);
-    v5 = vmlsq_n_s32(v5, u5, cospi[16]);
-    v5 = vrshlq_s32(v5, v_bit);
-
-    v6 = vmulq_n_s32(u7, cospi[16]);
-    v6 = vmlsq_n_s32(v6, u6, cospi[48]);
-    v6 = vrshlq_s32(v6, v_bit);
-
-    v7 = vmulq_n_s32(u6, cospi[16]);
-    v7 = vmlaq_n_s32(v7, u7, cospi[48]);
-    v7 = vrshlq_s32(v7, v_bit);
-
-    // stage 5
-    u0 = vaddq_s32(v0, v4);
-    u1 = vaddq_s32(v1, v5);
-    u2 = vaddq_s32(v2, v6);
-    u3 = vaddq_s32(v3, v7);
-    u4 = vsubq_s32(v0, v4);
-    u5 = vsubq_s32(v1, v5);
-    u6 = vsubq_s32(v2, v6);
-    u7 = vsubq_s32(v3, v7);
-
-    // stage 6
-    v0 = vmulq_n_s32(u0, cospi[4]);
-    v0 = vmlaq_n_s32(v0, u1, cospi[60]);
-    v0 = vrshlq_s32(v0, v_bit);
-
-    v1 = vmulq_n_s32(u0, cospi[60]);
-    v1 = vmlsq_n_s32(v1, u1, cospi[4]);
-    v1 = vrshlq_s32(v1, v_bit);
-
-    v2 = vmulq_n_s32(u2, cospi[20]);
-    v2 = vmlaq_n_s32(v2, u3, cospi[44]);
-    v2 = vrshlq_s32(v2, v_bit);
-
-    v3 = vmulq_n_s32(u2, cospi[44]);
-    v3 = vmlsq_n_s32(v3, u3, cospi[20]);
-    v3 = vrshlq_s32(v3, v_bit);
-
-    v4 = vmulq_n_s32(u4, cospi[36]);
-    v4 = vmlaq_n_s32(v4, u5, cospi[28]);
-    v4 = vrshlq_s32(v4, v_bit);
-
-    v5 = vmulq_n_s32(u4, cospi[28]);
-    v5 = vmlsq_n_s32(v5, u5, cospi[36]);
-    v5 = vrshlq_s32(v5, v_bit);
-
-    x = vmulq_n_s32(u6, cospi[52]);
-    v6 = vmlaq_n_s32(x, u7, cospi[12]);
-    v6 = vrshlq_s32(v6, v_bit);
-
-    v7 = vmulq_n_s32(u6, cospi[12]);
-    v7 = vmlsq_n_s32(v7, u7, cospi[52]);
-    v7 = vrshlq_s32(v7, v_bit);
-
-    // stage 7
-    out[col_num * 0 + col] = v1;
-    out[col_num * 1 + col] = v6;
-    out[col_num * 2 + col] = v3;
-    out[col_num * 3 + col] = v4;
-    out[col_num * 4 + col] = v5;
-    out[col_num * 5 + col] = v2;
-    out[col_num * 6 + col] = v7;
-    out[col_num * 7 + col] = v0;
-  }
-}
-static void idtx8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
-  (void)bit;
-
-  for (int i = 0; i < col_num; i += 1) {
-    out[0 + 8 * i] = vshlq_n_s32(in[0 + 8 * i], 1);
-    out[1 + 8 * i] = vshlq_n_s32(in[1 + 8 * i], 1);
-    out[2 + 8 * i] = vshlq_n_s32(in[2 + 8 * i], 1);
-    out[3 + 8 * i] = vshlq_n_s32(in[3 + 8 * i], 1);
-    out[4 + 8 * i] = vshlq_n_s32(in[4 + 8 * i], 1);
-    out[5 + 8 * i] = vshlq_n_s32(in[5 + 8 * i], 1);
-    out[6 + 8 * i] = vshlq_n_s32(in[6 + 8 * i], 1);
-    out[7 + 8 * i] = vshlq_n_s32(in[7 + 8 * i], 1);
-  }
-}
-#if !CONFIG_REALTIME_ONLY
-static void idtx32x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
-  (void)bit;
-  (void)col_num;
-  for (int j = 0; j < 2; j++) {
-    out[j + 8 * 0] = vshlq_n_s32(in[j + 8 * 0], 1);
-    out[j + 8 * 1] = vshlq_n_s32(in[j + 8 * 1], 1);
-    out[j + 8 * 2] = vshlq_n_s32(in[j + 8 * 2], 1);
-    out[j + 8 * 3] = vshlq_n_s32(in[j + 8 * 3], 1);
-    out[j + 8 * 4] = vshlq_n_s32(in[j + 8 * 4], 1);
-    out[j + 8 * 5] = vshlq_n_s32(in[j + 8 * 5], 1);
-    out[j + 8 * 6] = vshlq_n_s32(in[j + 8 * 6], 1);
-    out[j + 8 * 7] = vshlq_n_s32(in[j + 8 * 7], 1);
-  }
-}
-#endif
-void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
-                             TX_TYPE tx_type, int bd) {
-  int32x4_t in[16], out[16];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
-  const int txw_idx = get_txw_idx(TX_8X8);
-  const int txh_idx = get_txh_idx(TX_8X8);
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case ADST_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case DCT_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case ADST_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case IDTX:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case V_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case H_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case V_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case H_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case V_FLIPADST:
-      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    case H_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      col_txfm_8x8_rounding(out, &v_shift1);
-      transpose_8x8(out, in);
-      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
-      write_buffer_8x8(out, coeff);
-      break;
-    default: assert(0);
-  }
-  (void)bd;
-}
-
-// Hybrid Transform 16x16
-
-static INLINE void convert_8x8_to_16x16(const int32x4_t *in, int32x4_t *out) {
-  int row_index = 0;
-  int dst_index = 0;
-  int src_index = 0;
-
-  // row 0, 1, .., 7
-  do {
-    out[dst_index] = in[src_index];
-    out[dst_index + 1] = in[src_index + 1];
-    out[dst_index + 2] = in[src_index + 16];
-    out[dst_index + 3] = in[src_index + 17];
-    dst_index += 4;
-    src_index += 2;
-    row_index += 1;
-  } while (row_index < 8);
-
-  // row 8, 9, ..., 15
-  src_index += 16;
-  do {
-    out[dst_index] = in[src_index];
-    out[dst_index + 1] = in[src_index + 1];
-    out[dst_index + 2] = in[src_index + 16];
-    out[dst_index + 3] = in[src_index + 17];
-    dst_index += 4;
-    src_index += 2;
-    row_index += 1;
-  } while (row_index < 16);
-}
-
-static INLINE void load_buffer_16x16(const int16_t *input, int32x4_t *out,
-                                     int stride, int flipud, int fliplr,
-                                     int shift) {
-  int32x4_t in[64];
-  // Load 4 8x8 blocks
-  const int16_t *topL = input;
-  const int16_t *topR = input + 8;
-  const int16_t *botL = input + 8 * stride;
-  const int16_t *botR = input + 8 * stride + 8;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    // Swap left columns
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-    // Swap right columns
-    tmp = topR;
-    topR = botR;
-    botR = tmp;
-  }
-
-  if (fliplr) {
-    // Swap top rows
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-    // Swap bottom rows
-    tmp = botL;
-    botL = botR;
-    botR = tmp;
-  }
-
-  // load first 8 columns
-  load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
-  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
-
-  // load second 8 columns
-  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
-  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
-
-  convert_8x8_to_16x16(in, out);
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, int32x4_t *out,
-                                    int stride, int flipud, int fliplr,
-                                    int shift) {
-  const int16_t *topL = input;
-  const int16_t *botL = input + 8 * stride;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-  }
-
-  load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
-  load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
-}
-
-static INLINE void load_buffer_8x4(const int16_t *input, int32x4_t *out,
-                                   int stride, int flipud, int fliplr,
-                                   const int32x4_t *v_shift) {
-  const int16_t *topL = input;
-  const int16_t *topR = input + 4;
-
-  const int16_t *tmp;
-
-  if (fliplr) {
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-  }
-  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
-  load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, v_shift);
-}
-
-static INLINE void load_buffer_16x4(const int16_t *input, int32x4_t *out,
-                                    int stride, int flipud, int fliplr,
-                                    const int32x4_t *v_shift) {
-  const int16_t *topL = input;
-  const int16_t *topR = input + 8;
-
-  const int16_t *tmp;
-
-  if (fliplr) {
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-  }
-
-  load_buffer_8x4(topL, out, stride, flipud, fliplr, v_shift);
-  load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, v_shift);
-}
-
-static INLINE void load_buffer_4x8(const int16_t *input, int32x4_t *out,
-                                   int stride, int flipud, int fliplr,
-                                   const int32x4_t *v_shift) {
-  const int16_t *topL = input;
-  const int16_t *botL = input + 4 * stride;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-  }
-
-  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
-  load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, v_shift);
-}
-
-#if !CONFIG_REALTIME_ONLY
-static INLINE void load_buffer_4x16(const int16_t *input, int32x4_t *out,
-                                    const int stride, const int flipud,
-                                    const int fliplr,
-                                    const int32x4_t *v_shift) {
-  const int16_t *topL = input;
-  const int16_t *botL = input + 8 * stride;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-  }
-  load_buffer_4x8(topL, out, stride, flipud, fliplr, v_shift);
-  load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, v_shift);
-}
-#endif
-
-static INLINE void load_buffer_32x8n(const int16_t *input, int32x4_t *out,
-                                     int stride, int flipud, int fliplr,
-                                     int shift, const int height) {
-  const int16_t *in = input;
-  int32x4_t *output = out;
-  for (int col = 0; col < height; col++) {
-    in = input + col * stride;
-    output = out + col * 8;
-    int32x4_t v_shift = vdupq_n_s32(shift);
-    load_buffer_4x4(in, output, 4, flipud, fliplr, &v_shift);
-    load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, &v_shift);
-  }
-}
-
-static void fdct16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
-                           const int col_num) {
-  const int32_t *cospi = cospi_arr(bit);
-  const int32x4_t v_bit = vdupq_n_s32(-bit);
-  int32x4_t u[16], v[16];
-  int col;
-
-  // Calculate the column 0, 1, 2, 3
-  for (col = 0; col < col_num; ++col) {
-    // stage 0
-    // stage 1
-    u[0] = vaddq_s32(in[0 * col_num + col], in[15 * col_num + col]);
-    u[15] = vsubq_s32(in[0 * col_num + col], in[15 * col_num + col]);
-    u[1] = vaddq_s32(in[1 * col_num + col], in[14 * col_num + col]);
-    u[14] = vsubq_s32(in[1 * col_num + col], in[14 * col_num + col]);
-    u[2] = vaddq_s32(in[2 * col_num + col], in[13 * col_num + col]);
-    u[13] = vsubq_s32(in[2 * col_num + col], in[13 * col_num + col]);
-    u[3] = vaddq_s32(in[3 * col_num + col], in[12 * col_num + col]);
-    u[12] = vsubq_s32(in[3 * col_num + col], in[12 * col_num + col]);
-    u[4] = vaddq_s32(in[4 * col_num + col], in[11 * col_num + col]);
-    u[11] = vsubq_s32(in[4 * col_num + col], in[11 * col_num + col]);
-    u[5] = vaddq_s32(in[5 * col_num + col], in[10 * col_num + col]);
-    u[10] = vsubq_s32(in[5 * col_num + col], in[10 * col_num + col]);
-    u[6] = vaddq_s32(in[6 * col_num + col], in[9 * col_num + col]);
-    u[9] = vsubq_s32(in[6 * col_num + col], in[9 * col_num + col]);
-    u[7] = vaddq_s32(in[7 * col_num + col], in[8 * col_num + col]);
-    u[8] = vsubq_s32(in[7 * col_num + col], in[8 * col_num + col]);
-
-    // stage 2
-    v[0] = vaddq_s32(u[0], u[7]);
-    v[7] = vsubq_s32(u[0], u[7]);
-    v[1] = vaddq_s32(u[1], u[6]);
-    v[6] = vsubq_s32(u[1], u[6]);
-    v[2] = vaddq_s32(u[2], u[5]);
-    v[5] = vsubq_s32(u[2], u[5]);
-    v[3] = vaddq_s32(u[3], u[4]);
-    v[4] = vsubq_s32(u[3], u[4]);
-    v[8] = u[8];
-    v[9] = u[9];
-
-    v[10] = vmulq_n_s32(u[13], cospi[32]);
-    v[10] = vmlsq_n_s32(v[10], u[10], cospi[32]);
-    v[10] = vrshlq_s32(v[10], v_bit);
-
-    v[13] = vmulq_n_s32(u[10], cospi[32]);
-    v[13] = vmlaq_n_s32(v[13], u[13], cospi[32]);
-    v[13] = vrshlq_s32(v[13], v_bit);
-
-    v[11] = vmulq_n_s32(u[12], cospi[32]);
-    v[11] = vmlsq_n_s32(v[11], u[11], cospi[32]);
-    v[11] = vrshlq_s32(v[11], v_bit);
-
-    v[12] = vmulq_n_s32(u[11], cospi[32]);
-    v[12] = vmlaq_n_s32(v[12], u[12], cospi[32]);
-    v[12] = vrshlq_s32(v[12], v_bit);
-    v[14] = u[14];
-    v[15] = u[15];
-
-    // stage 3
-    u[0] = vaddq_s32(v[0], v[3]);
-    u[3] = vsubq_s32(v[0], v[3]);
-    u[1] = vaddq_s32(v[1], v[2]);
-    u[2] = vsubq_s32(v[1], v[2]);
-    u[4] = v[4];
-
-    u[5] = vmulq_n_s32(v[6], cospi[32]);
-    u[5] = vmlsq_n_s32(u[5], v[5], cospi[32]);
-    u[5] = vrshlq_s32(u[5], v_bit);
-
-    u[6] = vmulq_n_s32(v[5], cospi[32]);
-    u[6] = vmlaq_n_s32(u[6], v[6], cospi[32]);
-    u[6] = vrshlq_s32(u[6], v_bit);
-
-    u[7] = v[7];
-    u[8] = vaddq_s32(v[8], v[11]);
-    u[11] = vsubq_s32(v[8], v[11]);
-    u[9] = vaddq_s32(v[9], v[10]);
-    u[10] = vsubq_s32(v[9], v[10]);
-    u[12] = vsubq_s32(v[15], v[12]);
-    u[15] = vaddq_s32(v[15], v[12]);
-    u[13] = vsubq_s32(v[14], v[13]);
-    u[14] = vaddq_s32(v[14], v[13]);
-
-    // stage 4
-    u[0] = vmulq_n_s32(u[0], cospi[32]);
-    u[1] = vmulq_n_s32(u[1], cospi[32]);
-    v[0] = vaddq_s32(u[0], u[1]);
-    v[0] = vrshlq_s32(v[0], v_bit);
-
-    v[1] = vsubq_s32(u[0], u[1]);
-    v[1] = vrshlq_s32(v[1], v_bit);
-
-    v[2] = vmulq_n_s32(u[2], cospi[48]);
-    v[2] = vmlaq_n_s32(v[2], u[3], cospi[16]);
-    v[2] = vrshlq_s32(v[2], v_bit);
-
-    v[3] = vmulq_n_s32(u[3], cospi[48]);
-    v[3] = vmlsq_n_s32(v[3], u[2], cospi[16]);
-    v[3] = vrshlq_s32(v[3], v_bit);
-
-    v[4] = vaddq_s32(u[4], u[5]);
-    v[5] = vsubq_s32(u[4], u[5]);
-    v[6] = vsubq_s32(u[7], u[6]);
-    v[7] = vaddq_s32(u[7], u[6]);
-    v[8] = u[8];
-
-    v[9] = vmulq_n_s32(u[14], cospi[48]);
-    v[9] = vmlsq_n_s32(v[9], u[9], cospi[16]);
-    v[9] = vrshlq_s32(v[9], v_bit);
-
-    v[14] = vmulq_n_s32(u[9], cospi[48]);
-    v[14] = vmlaq_n_s32(v[14], u[14], cospi[16]);
-    v[14] = vrshlq_s32(v[14], v_bit);
-
-    v[10] = vmulq_n_s32(u[13], -cospi[16]);
-    v[10] = vmlsq_n_s32(v[10], u[10], cospi[48]);
-    v[10] = vrshlq_s32(v[10], v_bit);
-
-    v[13] = vmulq_n_s32(u[10], -cospi[16]);
-    v[13] = vmlaq_n_s32(v[13], u[13], cospi[48]);
-    v[13] = vrshlq_s32(v[13], v_bit);
-
-    v[11] = u[11];
-    v[12] = u[12];
-    v[15] = u[15];
-
-    // stage 5
-    u[0] = v[0];
-    u[1] = v[1];
-    u[2] = v[2];
-    u[3] = v[3];
-
-    u[4] = vmulq_n_s32(v[4], cospi[56]);
-    u[4] = vmlaq_n_s32(u[4], v[7], cospi[8]);
-    u[4] = vrshlq_s32(u[4], v_bit);
-
-    u[7] = vmulq_n_s32(v[7], cospi[56]);
-    u[7] = vmlsq_n_s32(u[7], v[4], cospi[8]);
-    u[7] = vrshlq_s32(u[7], v_bit);
-
-    u[5] = vmulq_n_s32(v[5], cospi[24]);
-    u[5] = vmlaq_n_s32(u[5], v[6], cospi[40]);
-    u[5] = vrshlq_s32(u[5], v_bit);
-
-    u[6] = vmulq_n_s32(v[6], cospi[24]);
-    u[6] = vmlsq_n_s32(u[6], v[5], cospi[40]);
-    u[6] = vrshlq_s32(u[6], v_bit);
-
-    u[8] = vaddq_s32(v[8], v[9]);
-    u[9] = vsubq_s32(v[8], v[9]);
-    u[10] = vsubq_s32(v[11], v[10]);
-    u[11] = vaddq_s32(v[11], v[10]);
-    u[12] = vaddq_s32(v[12], v[13]);
-    u[13] = vsubq_s32(v[12], v[13]);
-    u[14] = vsubq_s32(v[15], v[14]);
-    u[15] = vaddq_s32(v[15], v[14]);
-
-    // stage 6
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
-
-    v[8] = vmulq_n_s32(u[8], cospi[60]);
-    v[8] = vmlaq_n_s32(v[8], u[15], cospi[4]);
-    v[8] = vrshlq_s32(v[8], v_bit);
-
-    v[15] = vmulq_n_s32(u[15], cospi[60]);
-    v[15] = vmlsq_n_s32(v[15], u[8], cospi[4]);
-    v[15] = vrshlq_s32(v[15], v_bit);
-
-    v[9] = vmulq_n_s32(u[9], cospi[28]);
-    v[9] = vmlaq_n_s32(v[9], u[14], cospi[36]);
-    v[9] = vrshlq_s32(v[9], v_bit);
-
-    v[14] = vmulq_n_s32(u[14], cospi[28]);
-    v[14] = vmlsq_n_s32(v[14], u[9], cospi[36]);
-    v[14] = vrshlq_s32(v[14], v_bit);
-
-    v[10] = vmulq_n_s32(u[10], cospi[44]);
-    v[10] = vmlaq_n_s32(v[10], u[13], cospi[20]);
-    v[10] = vrshlq_s32(v[10], v_bit);
-
-    v[13] = vmulq_n_s32(u[13], cospi[44]);
-    v[13] = vmlsq_n_s32(v[13], u[10], cospi[20]);
-    v[13] = vrshlq_s32(v[13], v_bit);
-
-    v[11] = vmulq_n_s32(u[11], cospi[12]);
-    v[11] = vmlaq_n_s32(v[11], u[12], cospi[52]);
-    v[11] = vrshlq_s32(v[11], v_bit);
-
-    v[12] = vmulq_n_s32(u[12], cospi[12]);
-    v[12] = vmlsq_n_s32(v[12], u[11], cospi[52]);
-    v[12] = vrshlq_s32(v[12], v_bit);
-
-    out[0 * col_num + col] = v[0];
-    out[1 * col_num + col] = v[8];
-    out[2 * col_num + col] = v[4];
-    out[3 * col_num + col] = v[12];
-    out[4 * col_num + col] = v[2];
-    out[5 * col_num + col] = v[10];
-    out[6 * col_num + col] = v[6];
-    out[7 * col_num + col] = v[14];
-    out[8 * col_num + col] = v[1];
-    out[9 * col_num + col] = v[9];
-    out[10 * col_num + col] = v[5];
-    out[11 * col_num + col] = v[13];
-    out[12 * col_num + col] = v[3];
-    out[13 * col_num + col] = v[11];
-    out[14 * col_num + col] = v[7];
-    out[15 * col_num + col] = v[15];
-  }
-}
-
-static void fadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
-                            const int num_cols) {
-  const int32_t *cospi = cospi_arr(bit);
-
-  const int32x4_t v_bit = vdupq_n_s32(-bit);
-
-  int32x4_t u[16], v[16], x, y;
-  int col;
-
-  for (col = 0; col < num_cols; ++col) {
-    // stage 0-1
-    u[0] = in[0 * num_cols + col];
-    u[1] = vnegq_s32(in[15 * num_cols + col]);
-    u[2] = vnegq_s32(in[7 * num_cols + col]);
-    u[3] = in[8 * num_cols + col];
-    u[4] = vnegq_s32(in[3 * num_cols + col]);
-    u[5] = in[12 * num_cols + col];
-    u[6] = in[4 * num_cols + col];
-    u[7] = vnegq_s32(in[11 * num_cols + col]);
-    u[8] = vnegq_s32(in[1 * num_cols + col]);
-    u[9] = in[14 * num_cols + col];
-    u[10] = in[6 * num_cols + col];
-    u[11] = vnegq_s32(in[9 * num_cols + col]);
-    u[12] = in[2 * num_cols + col];
-    u[13] = vnegq_s32(in[13 * num_cols + col]);
-    u[14] = vnegq_s32(in[5 * num_cols + col]);
-    u[15] = in[10 * num_cols + col];
-
-    // stage 2
-    v[0] = u[0];
-    v[1] = u[1];
-
-    x = vmulq_n_s32(u[2], cospi[32]);
-    y = vmulq_n_s32(u[3], cospi[32]);
-    v[2] = vaddq_s32(x, y);
-    v[2] = vrshlq_s32(v[2], v_bit);
-
-    v[3] = vsubq_s32(x, y);
-    v[3] = vrshlq_s32(v[3], v_bit);
-
-    v[4] = u[4];
-    v[5] = u[5];
-
-    x = vmulq_n_s32(u[6], cospi[32]);
-    y = vmulq_n_s32(u[7], cospi[32]);
-    v[6] = vaddq_s32(x, y);
-    v[6] = vrshlq_s32(v[6], v_bit);
-
-    v[7] = vsubq_s32(x, y);
-    v[7] = vrshlq_s32(v[7], v_bit);
-
-    v[8] = u[8];
-    v[9] = u[9];
-
-    x = vmulq_n_s32(u[10], cospi[32]);
-    y = vmulq_n_s32(u[11], cospi[32]);
-    v[10] = vaddq_s32(x, y);
-    v[10] = vrshlq_s32(v[10], v_bit);
-
-    v[11] = vsubq_s32(x, y);
-    v[11] = vrshlq_s32(v[11], v_bit);
-
-    v[12] = u[12];
-    v[13] = u[13];
-
-    x = vmulq_n_s32(u[14], cospi[32]);
-    y = vmulq_n_s32(u[15], cospi[32]);
-    v[14] = vaddq_s32(x, y);
-    v[14] = vrshlq_s32(v[14], v_bit);
-
-    v[15] = vsubq_s32(x, y);
-    v[15] = vrshlq_s32(v[15], v_bit);
-
-    // stage 3
-    u[0] = vaddq_s32(v[0], v[2]);
-    u[1] = vaddq_s32(v[1], v[3]);
-    u[2] = vsubq_s32(v[0], v[2]);
-    u[3] = vsubq_s32(v[1], v[3]);
-    u[4] = vaddq_s32(v[4], v[6]);
-    u[5] = vaddq_s32(v[5], v[7]);
-    u[6] = vsubq_s32(v[4], v[6]);
-    u[7] = vsubq_s32(v[5], v[7]);
-    u[8] = vaddq_s32(v[8], v[10]);
-    u[9] = vaddq_s32(v[9], v[11]);
-    u[10] = vsubq_s32(v[8], v[10]);
-    u[11] = vsubq_s32(v[9], v[11]);
-    u[12] = vaddq_s32(v[12], v[14]);
-    u[13] = vaddq_s32(v[13], v[15]);
-    u[14] = vsubq_s32(v[12], v[14]);
-    u[15] = vsubq_s32(v[13], v[15]);
-
-    // stage 4
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = half_btf_neon(&cospi[16], &u[4], &cospi[48], &u[5], v_bit);
-    v[7] = half_btf_neon(&cospi[16], &u[6], &cospi[48], &u[7], v_bit);
-    v[5] = half_btf_neon_m(&cospi[48], &u[4], &cospi[16], &u[5], v_bit);
-    v[6] = half_btf_neon_m(&cospi[16], &u[7], &cospi[48], &u[6], v_bit);
-
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
-
-    v[12] = half_btf_neon(&cospi[16], &u[12], &cospi[48], &u[13], v_bit);
-    v[15] = half_btf_neon(&cospi[16], &u[14], &cospi[48], &u[15], v_bit);
-    v[13] = half_btf_neon_m(&cospi[48], &u[12], &cospi[16], &u[13], v_bit);
-    v[14] = half_btf_neon_m(&cospi[16], &u[15], &cospi[48], &u[14], v_bit);
-
-    // stage 5
-    u[0] = vaddq_s32(v[0], v[4]);
-    u[1] = vaddq_s32(v[1], v[5]);
-    u[2] = vaddq_s32(v[2], v[6]);
-    u[3] = vaddq_s32(v[3], v[7]);
-    u[4] = vsubq_s32(v[0], v[4]);
-    u[5] = vsubq_s32(v[1], v[5]);
-    u[6] = vsubq_s32(v[2], v[6]);
-    u[7] = vsubq_s32(v[3], v[7]);
-    u[8] = vaddq_s32(v[8], v[12]);
-    u[9] = vaddq_s32(v[9], v[13]);
-    u[10] = vaddq_s32(v[10], v[14]);
-    u[11] = vaddq_s32(v[11], v[15]);
-    u[12] = vsubq_s32(v[8], v[12]);
-    u[13] = vsubq_s32(v[9], v[13]);
-    u[14] = vsubq_s32(v[10], v[14]);
-    u[15] = vsubq_s32(v[11], v[15]);
-
-    // stage 6
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
-
-    v[8] = half_btf_neon(&cospi[8], &u[8], &cospi[56], &u[9], v_bit);
-    v[13] = half_btf_neon(&cospi[8], &u[12], &cospi[56], &u[13], v_bit);
-    v[9] = half_btf_neon_m(&cospi[56], &u[8], &cospi[8], &u[9], v_bit);
-    v[12] = half_btf_neon_m(&cospi[8], &u[13], &cospi[56], &u[12], v_bit);
-
-    v[10] = half_btf_neon(&cospi[40], &u[10], &cospi[24], &u[11], v_bit);
-    v[15] = half_btf_neon(&cospi[40], &u[14], &cospi[24], &u[15], v_bit);
-    v[11] = half_btf_neon_m(&cospi[24], &u[10], &cospi[40], &u[11], v_bit);
-    v[14] = half_btf_neon_m(&cospi[40], &u[15], &cospi[24], &u[14], v_bit);
-
-    // stage 7
-    u[0] = vaddq_s32(v[0], v[8]);
-    u[1] = vaddq_s32(v[1], v[9]);
-    u[2] = vaddq_s32(v[2], v[10]);
-    u[3] = vaddq_s32(v[3], v[11]);
-    u[4] = vaddq_s32(v[4], v[12]);
-    u[5] = vaddq_s32(v[5], v[13]);
-    u[6] = vaddq_s32(v[6], v[14]);
-    u[7] = vaddq_s32(v[7], v[15]);
-    u[8] = vsubq_s32(v[0], v[8]);
-    u[9] = vsubq_s32(v[1], v[9]);
-    u[10] = vsubq_s32(v[2], v[10]);
-    u[11] = vsubq_s32(v[3], v[11]);
-    u[12] = vsubq_s32(v[4], v[12]);
-    u[13] = vsubq_s32(v[5], v[13]);
-    u[14] = vsubq_s32(v[6], v[14]);
-    u[15] = vsubq_s32(v[7], v[15]);
-
-    // stage 8
-    v[0] = half_btf_neon(&cospi[2], &u[0], &cospi[62], &u[1], v_bit);
-    v[1] = half_btf_neon_m(&cospi[62], &u[0], &cospi[2], &u[1], v_bit);
-    v[2] = half_btf_neon(&cospi[10], &u[2], &cospi[54], &u[3], v_bit);
-    v[3] = half_btf_neon_m(&cospi[54], &u[2], &cospi[10], &u[3], v_bit);
-    v[4] = half_btf_neon(&cospi[18], &u[4], &cospi[46], &u[5], v_bit);
-    v[5] = half_btf_neon_m(&cospi[46], &u[4], &cospi[18], &u[5], v_bit);
-    v[6] = half_btf_neon(&cospi[26], &u[6], &cospi[38], &u[7], v_bit);
-    v[7] = half_btf_neon_m(&cospi[38], &u[6], &cospi[26], &u[7], v_bit);
-    v[8] = half_btf_neon(&cospi[34], &u[8], &cospi[30], &u[9], v_bit);
-    v[9] = half_btf_neon_m(&cospi[30], &u[8], &cospi[34], &u[9], v_bit);
-    v[10] = half_btf_neon(&cospi[42], &u[10], &cospi[22], &u[11], v_bit);
-    v[11] = half_btf_neon_m(&cospi[22], &u[10], &cospi[42], &u[11], v_bit);
-    v[12] = half_btf_neon(&cospi[50], &u[12], &cospi[14], &u[13], v_bit);
-    v[13] = half_btf_neon_m(&cospi[14], &u[12], &cospi[50], &u[13], v_bit);
-    v[14] = half_btf_neon(&cospi[58], &u[14], &cospi[6], &u[15], v_bit);
-    v[15] = half_btf_neon_m(&cospi[6], &u[14], &cospi[58], &u[15], v_bit);
-
-    // stage 9
-    out[0 * num_cols + col] = v[1];
-    out[1 * num_cols + col] = v[14];
-    out[2 * num_cols + col] = v[3];
-    out[3 * num_cols + col] = v[12];
-    out[4 * num_cols + col] = v[5];
-    out[5 * num_cols + col] = v[10];
-    out[6 * num_cols + col] = v[7];
-    out[7 * num_cols + col] = v[8];
-    out[8 * num_cols + col] = v[9];
-    out[9 * num_cols + col] = v[6];
-    out[10 * num_cols + col] = v[11];
-    out[11 * num_cols + col] = v[4];
-    out[12 * num_cols + col] = v[13];
-    out[13 * num_cols + col] = v[2];
-    out[14 * num_cols + col] = v[15];
-    out[15 * num_cols + col] = v[0];
-  }
-}
-
-static void col_txfm_16x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
-  // Note:
-  //  We split 16x16 rounding into 4 sections of 8x8 rounding,
-  //  instead of 4 columns
-  col_txfm_8x8_rounding(&in[0], v_shift);
-  col_txfm_8x8_rounding(&in[16], v_shift);
-  col_txfm_8x8_rounding(&in[32], v_shift);
-  col_txfm_8x8_rounding(&in[48], v_shift);
-}
-
-static void col_txfm_8x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
-  col_txfm_8x8_rounding(&in[0], v_shift);
-  col_txfm_8x8_rounding(&in[16], v_shift);
-}
-
-static void write_buffer_16x16(const int32x4_t *in, int32_t *output) {
-  const int size_8x8 = 16 * 4;
-  write_buffer_8x8(&in[0], output);
-  output += size_8x8;
-  write_buffer_8x8(&in[16], output);
-  output += size_8x8;
-  write_buffer_8x8(&in[32], output);
-  output += size_8x8;
-  write_buffer_8x8(&in[48], output);
-}
-static void idtx16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
-                           int col_num) {
-  (void)bit;
-  int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
-  int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
-  int32x4_t a_low;
-
-  int num_iters = 16 * col_num;
-  for (int i = 0; i < num_iters; i++) {
-    a_low = vmulq_s32(in[i], fact);
-    a_low = vaddq_s32(a_low, offset);
-    out[i] = vshrq_n_s32(a_low, NewSqrt2Bits);
-  }
-}
-void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
-                               TX_TYPE tx_type, int bd) {
-  int32x4_t in[64], out[64];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
-  const int txw_idx = get_txw_idx(TX_16X16);
-  const int txh_idx = get_txh_idx(TX_16X16);
-  const int col_num = 4;
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case ADST_DCT:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case DCT_ADST:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case ADST_ADST:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case IDTX:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case V_DCT:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case H_DCT:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case V_ADST:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case H_ADST:
-      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case V_FLIPADST:
-      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    case H_FLIPADST:
-      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
-      col_txfm_16x16_rounding(out, &v_shift);
-      transpose_16x16(out, in);
-      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
-      write_buffer_16x16(out, coeff);
-      break;
-    default: assert(0);
-  }
-  (void)bd;
-}
-
-static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
-  for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
-  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
-}
-
-typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
-                                      const int num_cols);
-
-static const fwd_transform_1d_neon col_highbd_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,   // DCT_DCT
-  fadst8x8_neon,  // ADST_DCT
-  fdct8x8_neon,   // DCT_ADST
-  fadst8x8_neon,  // ADST_ADST
-  fadst8x8_neon,  // FLIPADST_DCT
-  fdct8x8_neon,   // DCT_FLIPADST
-  fadst8x8_neon,  // FLIPADST_FLIPADST
-  fadst8x8_neon,  // ADST_FLIPADST
-  fadst8x8_neon,  // FLIPADST_ADST
-  idtx8x8_neon,   // IDTX
-  fdct8x8_neon,   // V_DCT
-  idtx8x8_neon,   // H_DCT
-  fadst8x8_neon,  // V_ADST
-  idtx8x8_neon,   // H_ADST
-  fadst8x8_neon,  // V_FLIPADST
-  idtx8x8_neon    // H_FLIPADST
-};
-#if !CONFIG_REALTIME_ONLY
-static const fwd_transform_1d_neon row_highbd_txfm32x8_arr[TX_TYPES] = {
-  fdct8x8_neon,   // DCT_DCT
-  NULL,           // ADST_DCT
-  NULL,           // DCT_ADST
-  NULL,           // ADST_ADST
-  NULL,           // FLIPADST_DCT
-  NULL,           // DCT_FLIPADST
-  NULL,           // FLIPADST_FLIPADST
-  NULL,           // ADST_FLIPADST
-  NULL,           // FLIPADST_ADST
-  idtx32x8_neon,  // IDTX
-  NULL,           // V_DCT
-  NULL,           // H_DCT
-  NULL,           // V_ADST
-  NULL,           // H_ADST
-  NULL,           // V_FLIPADST
-  NULL,           // H_FLIPADST
-};
-#endif
-static const fwd_transform_1d_neon col_highbd_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,   // DCT_DCT
-  fadst8x8_neon,  // ADST_DCT
-  fdct4x8_neon,   // DCT_ADST
-  fadst8x8_neon,  // ADST_ADST
-  fadst8x8_neon,  // FLIPADST_DCT
-  fdct4x8_neon,   // DCT_FLIPADST
-  fadst8x8_neon,  // FLIPADST_FLIPADST
-  fadst8x8_neon,  // ADST_FLIPADST
-  fadst8x8_neon,  // FLIPADST_ADST
-  idtx8x8_neon,   // IDTX
-  fdct4x8_neon,   // V_DCT
-  idtx8x8_neon,   // H_DCT
-  fadst8x8_neon,  // V_ADST
-  idtx8x8_neon,   // H_ADST
-  fadst8x8_neon,  // V_FLIPADST
-  idtx8x8_neon    // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon row_highbd_txfm8x16_arr[TX_TYPES] = {
-  fdct16x16_neon,   // DCT_DCT
-  fdct16x16_neon,   // ADST_DCT
-  fadst16x16_neon,  // DCT_ADST
-  fadst16x16_neon,  // ADST_ADST
-  fdct16x16_neon,   // FLIPADST_DCT
-  fadst16x16_neon,  // DCT_FLIPADST
-  fadst16x16_neon,  // FLIPADST_FLIPADST
-  fadst16x16_neon,  // ADST_FLIPADST
-  fadst16x16_neon,  // FLIPADST_ADST
-  idtx16x16_neon,   // IDTX
-  idtx16x16_neon,   // V_DCT
-  fdct16x16_neon,   // H_DCT
-  idtx16x16_neon,   // V_ADST
-  fadst16x16_neon,  // H_ADST
-  idtx16x16_neon,   // V_FLIPADST
-  fadst16x16_neon   // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon col_highbd_txfm8x16_arr[TX_TYPES] = {
-  fdct16x16_neon,   // DCT_DCT
-  fadst16x16_neon,  // ADST_DCT
-  fdct16x16_neon,   // DCT_ADST
-  fadst16x16_neon,  // ADST_ADST
-  fadst16x16_neon,  // FLIPADST_DCT
-  fdct16x16_neon,   // DCT_FLIPADST
-  fadst16x16_neon,  // FLIPADST_FLIPADST
-  fadst16x16_neon,  // ADST_FLIPADST
-  fadst16x16_neon,  // FLIPADST_ADST
-  idtx16x16_neon,   // IDTX
-  fdct16x16_neon,   // V_DCT
-  idtx16x16_neon,   // H_DCT
-  fadst16x16_neon,  // V_ADST
-  idtx16x16_neon,   // H_ADST
-  fadst16x16_neon,  // V_FLIPADST
-  idtx16x16_neon    // H_FLIPADST
-};
-static const fwd_transform_1d_neon row_highbd_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,   // DCT_DCT
-  fdct8x8_neon,   // ADST_DCT
-  fadst8x8_neon,  // DCT_ADST
-  fadst8x8_neon,  // ADST_ADST
-  fdct8x8_neon,   // FLIPADST_DCT
-  fadst8x8_neon,  // DCT_FLIPADST
-  fadst8x8_neon,  // FLIPADST_FLIPADST
-  fadst8x8_neon,  // ADST_FLIPADST
-  fadst8x8_neon,  // FLIPADST_ADST
-  idtx8x8_neon,   // IDTX
-  idtx8x8_neon,   // V_DCT
-  fdct8x8_neon,   // H_DCT
-  idtx8x8_neon,   // V_ADST
-  fadst8x8_neon,  // H_ADST
-  idtx8x8_neon,   // V_FLIPADST
-  fadst8x8_neon   // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon row_highbd_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,   // DCT_DCT
-  fdct4x8_neon,   // ADST_DCT
-  fadst8x8_neon,  // DCT_ADST
-  fadst8x8_neon,  // ADST_ADST
-  fdct4x8_neon,   // FLIPADST_DCT
-  fadst8x8_neon,  // DCT_FLIPADST
-  fadst8x8_neon,  // FLIPADST_FLIPADST
-  fadst8x8_neon,  // ADST_FLIPADST
-  fadst8x8_neon,  // FLIPADST_ADST
-  idtx8x8_neon,   // IDTX
-  idtx8x8_neon,   // V_DCT
-  fdct4x8_neon,   // H_DCT
-  idtx8x8_neon,   // V_ADST
-  fadst8x8_neon,  // H_ADST
-  idtx8x8_neon,   // V_FLIPADST
-  fadst8x8_neon   // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon row_highbd_txfm4x4_arr[TX_TYPES] = {
-  fdct4x4_neon,   // DCT_DCT
-  fdct4x4_neon,   // ADST_DCT
-  fadst4x4_neon,  // DCT_ADST
-  fadst4x4_neon,  // ADST_ADST
-  fdct4x4_neon,   // FLIPADST_DCT
-  fadst4x4_neon,  // DCT_FLIPADST
-  fadst4x4_neon,  // FLIPADST_FLIPADST
-  fadst4x4_neon,  // ADST_FLIPADST
-  fadst4x4_neon,  // FLIPADST_ADST
-  idtx4x4_neon,   // IDTX
-  idtx4x4_neon,   // V_DCT
-  fdct4x4_neon,   // H_DCT
-  idtx4x4_neon,   // V_ADST
-  fadst4x4_neon,  // H_ADST
-  idtx4x4_neon,   // V_FLIPADST
-  fadst4x4_neon   // H_FLIPADST
-};
-
-static const fwd_transform_1d_neon col_highbd_txfm4x4_arr[TX_TYPES] = {
-  fdct4x4_neon,   // DCT_DCT
-  fadst4x4_neon,  // ADST_DCT
-  fdct4x4_neon,   // DCT_ADST
-  fadst4x4_neon,  // ADST_ADST
-  fadst4x4_neon,  // FLIPADST_DCT
-  fdct4x4_neon,   // DCT_FLIPADST
-  fadst4x4_neon,  // FLIPADST_FLIPADST
-  fadst4x4_neon,  // ADST_FLIPADST
-  fadst4x4_neon,  // FLIPADST_ADST
-  idtx4x4_neon,   // IDTX
-  fdct4x4_neon,   // V_DCT
-  idtx4x4_neon,   // H_DCT
-  fadst4x4_neon,  // V_ADST
-  idtx4x4_neon,   // H_ADST
-  fadst4x4_neon,  // V_FLIPADST
-  idtx4x4_neon    // H_FLIPADST
-};
-
-void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
-                         const int stride) {
-  int32x4_t buf0[32];
-  int32x4_t buf1[32];
-  const int32_t *cospi;
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
-  int startidx = 0 * stride;
-  int endidx = 31 * stride;
-  // stage 0
-  // stage 1
-  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
-  startidx += stride;
-  endidx -= stride;
-  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
-  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+  // stage 0-1
+  u0 = in[0];
+  u1 = in[7];
+  u2 = in[3];
+  u3 = in[4];
+  u4 = in[1];
+  u5 = in[6];
+  u6 = in[2];
+  u7 = in[5];
 
   // stage 2
-  cospi = cospi_arr(cos_bit);
-  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
-  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
-  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
-  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
-  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
-  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
-  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
-  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
-  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
-  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
-  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
-  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
-  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
-  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
-  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
-  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  v0 = u0;
+  v1 = u1;
+  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
+  v4 = u4;
+  v5 = u5;
+  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);
+
+  // stage 3
+  u0 = vaddq_s32(v0, v2);
+  u1 = vsubq_s32(v3, v1);
+  u2 = vsubq_s32(v0, v2);
+  u3 = vaddq_s32(v1, v3);
+  u4 = vsubq_s32(v6, v4);
+  u5 = vaddq_s32(v5, v7);
+  u6 = vaddq_s32(v4, v6);
+  u7 = vsubq_s32(v5, v7);
+
+  // stage 4
+  v0 = u0;
+  v1 = u1;
+  v2 = u2;
+  v3 = u3;
+
+  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
+  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);
+
+  // stage 5
+  u0 = vaddq_s32(v0, v4);
+  u1 = vaddq_s32(v1, v5);
+  u2 = vaddq_s32(v2, v6);
+  u3 = vsubq_s32(v7, v3);
+  u4 = vsubq_s32(v0, v4);
+  u5 = vsubq_s32(v1, v5);
+  u6 = vsubq_s32(v2, v6);
+  u7 = vaddq_s32(v3, v7);
+
+  // stage 6
+  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
+  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
+  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
+  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);
+
+  // stage 7
+  out[0] = v1;
+  out[1] = v6;
+  out[2] = v3;
+  out[3] = v4;
+  out[4] = v5;
+  out[5] = v2;
+  out[6] = v7;
+  out[7] = v0;
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
+                                                       int32x4_t *out,
+                                                       int bit) {
+  (void)bit;
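+  // The 8-point identity transform scales each input by 2, implemented here
+  // as a left shift by one.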
+  out[0] = vshlq_n_s32(in[0], 1);
+  out[1] = vshlq_n_s32(in[1], 1);
+  out[2] = vshlq_n_s32(in[2], 1);
+  out[3] = vshlq_n_s32(in[3], 1);
+  out[4] = vshlq_n_s32(in[4], 1);
+  out[5] = vshlq_n_s32(in[5], 1);
+  out[6] = vshlq_n_s32(in[6], 1);
+  out[7] = vshlq_n_s32(in[7], 1);
+}
+
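+// The *_xn_neon wrappers below simply invoke the corresponding *_x4_neon
+// kernel 'howmany' times, once per consecutive block of 'stride' int32x4_t
+// vectors.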
+static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
+                                                  int32x4_t *out, int bit,
+                                                  int howmany) {
+  const int stride = 8;
+  int i = 0;
+  do {
+    highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
+                                                   int32x4_t *out, int bit,
+                                                   int howmany) {
+  const int stride = 8;
+  int i = 0;
+  do {
+    highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
+                                                       int32x4_t *out, int bit,
+                                                       int howmany) {
+  (void)bit;
+  const int stride = 8;
+  int i = 0;
+  do {
+    highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Workspaces for column/row-wise transforms.
+  int32x4_t buf0[16], buf1[16];
+
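+  // Every tx_type follows the same pipeline: load (with an optional
+  // left/right flip), column transform, round-shift right by 1, transpose,
+  // row transform, store.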
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    default: assert(0);
+  }
+}
+
+static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
+                                  int bit) {
+  const int32_t *const cospi = cospi_arr_s32(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  int32x4_t u[16], v[16];
+
+  // stage 1
+  butterfly_dct_pre(in, u, 16);
+
+  // stage 2
+  butterfly_dct_pre(u, v, 8);
+  v[8] = u[8];
+  v[9] = u[9];
+  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
+  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
+  v[14] = u[14];
+  v[15] = u[15];
+
+  // stage 3
+  butterfly_dct_pre(v, u, 4);
+  u[4] = v[4];
+  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
+  u[7] = v[7];
+  butterfly_dct_post(v + 8, v + 8, u + 8, 8);
+
+  // stage 4
+  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
+  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
+  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
+  v[8] = u[8];
+  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
+  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
+  v[11] = u[11];
+  v[12] = u[12];
+  v[15] = u[15];
+
+  // stage 5
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
+  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
+  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
+  butterfly_dct_post(v + 12, v + 12, u + 12, 4);
+
+  // stage 6
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  v[4] = u[4];
+  v[5] = u[5];
+  v[6] = u[6];
+  v[7] = u[7];
+  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
+  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
+  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
+  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);
+
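+  // Write the results in bit-reversed index order.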
+  out[0] = v[0];
+  out[1] = v[8];
+  out[2] = v[4];
+  out[3] = v[12];
+  out[4] = v[2];
+  out[5] = v[10];
+  out[6] = v[6];
+  out[7] = v[14];
+  out[8] = v[1];
+  out[9] = v[9];
+  out[10] = v[5];
+  out[11] = v[13];
+  out[12] = v[3];
+  out[13] = v[11];
+  out[14] = v[7];
+  out[15] = v[15];
+}
+
+static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
+                                   int bit) {
+  const int32_t *const cospi = cospi_arr_s32(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  int32x4_t u[16], v[16];
+
+  // stage 0-1
+  u[0] = in[0];
+  u[1] = in[15];
+  u[2] = in[7];
+  u[3] = in[8];
+  u[4] = in[3];
+  u[5] = in[12];
+  u[6] = in[4];
+  u[7] = in[11];
+  u[8] = in[1];
+  u[9] = in[14];
+  u[10] = in[6];
+  u[11] = in[9];
+  u[12] = in[2];
+  u[13] = in[13];
+  u[14] = in[5];
+  u[15] = in[10];
+
+  // stage 2
+  v[0] = u[0];
+  v[1] = u[1];
+  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
+  v[4] = u[4];
+  v[5] = u[5];
+  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
+  v[8] = u[8];
+  v[9] = u[9];
+  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
+  v[12] = u[12];
+  v[13] = u[13];
+  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);
+
+  // stage 3
+  u[0] = vaddq_s32(v[0], v[2]);
+  u[1] = vsubq_s32(v[3], v[1]);
+  u[2] = vsubq_s32(v[0], v[2]);
+  u[3] = vaddq_s32(v[1], v[3]);
+  u[4] = vsubq_s32(v[6], v[4]);
+  u[5] = vaddq_s32(v[5], v[7]);
+  u[6] = vaddq_s32(v[4], v[6]);
+  u[7] = vsubq_s32(v[5], v[7]);
+  u[8] = vsubq_s32(v[10], v[8]);
+  u[9] = vaddq_s32(v[9], v[11]);
+  u[10] = vaddq_s32(v[8], v[10]);
+  u[11] = vsubq_s32(v[9], v[11]);
+  u[12] = vaddq_s32(v[12], v[14]);
+  u[13] = vsubq_s32(v[15], v[13]);
+  u[14] = vsubq_s32(v[12], v[14]);
+  u[15] = vaddq_s32(v[13], v[15]);
+
+  // stage 4
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
+  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
+
+  v[8] = u[8];
+  v[9] = u[9];
+  v[10] = u[10];
+  v[11] = u[11];
+
+  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
+  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);
+
+  // stage 5
+  u[0] = vaddq_s32(v[0], v[4]);
+  u[1] = vaddq_s32(v[1], v[5]);
+  u[2] = vaddq_s32(v[2], v[6]);
+  u[3] = vsubq_s32(v[7], v[3]);
+  u[4] = vsubq_s32(v[0], v[4]);
+  u[5] = vsubq_s32(v[1], v[5]);
+  u[6] = vsubq_s32(v[2], v[6]);
+  u[7] = vaddq_s32(v[3], v[7]);
+  u[8] = vaddq_s32(v[8], v[12]);
+  u[9] = vaddq_s32(v[9], v[13]);
+  u[10] = vsubq_s32(v[14], v[10]);
+  u[11] = vaddq_s32(v[11], v[15]);
+  u[12] = vsubq_s32(v[8], v[12]);
+  u[13] = vsubq_s32(v[9], v[13]);
+  u[14] = vaddq_s32(v[10], v[14]);
+  u[15] = vsubq_s32(v[11], v[15]);
+
+  // stage 6
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  v[4] = u[4];
+  v[5] = u[5];
+  v[6] = u[6];
+  v[7] = u[7];
+
+  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
+  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
+  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
+  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);
+
+  // stage 7
+  u[0] = vaddq_s32(v[0], v[8]);
+  u[1] = vaddq_s32(v[1], v[9]);
+  u[2] = vaddq_s32(v[2], v[10]);
+  u[3] = vaddq_s32(v[3], v[11]);
+  u[4] = vaddq_s32(v[4], v[12]);
+  u[5] = vaddq_s32(v[5], v[13]);
+  u[6] = vaddq_s32(v[6], v[14]);
+  u[7] = vsubq_s32(v[15], v[7]);
+  u[8] = vsubq_s32(v[0], v[8]);
+  u[9] = vsubq_s32(v[1], v[9]);
+  u[10] = vsubq_s32(v[2], v[10]);
+  u[11] = vsubq_s32(v[3], v[11]);
+  u[12] = vsubq_s32(v[4], v[12]);
+  u[13] = vsubq_s32(v[5], v[13]);
+  u[14] = vsubq_s32(v[6], v[14]);
+  u[15] = vaddq_s32(v[7], v[15]);
+
+  // stage 8
+  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
+  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
+  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
+  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
+  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
+  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
+  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
+  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);
+
+  // stage 9
+  out[0] = v[1];
+  out[1] = v[14];
+  out[2] = v[3];
+  out[3] = v[12];
+  out[4] = v[5];
+  out[5] = v[10];
+  out[6] = v[7];
+  out[7] = v[8];
+  out[8] = v[9];
+  out[9] = v[6];
+  out[10] = v[11];
+  out[11] = v[4];
+  out[12] = v[13];
+  out[13] = v[2];
+  out[14] = v[15];
+  out[15] = v[0];
+}
+
+static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
+                                       int bit) {
+  (void)bit;
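+  // The 16-point identity transform scales by 2*sqrt(2): multiply by
+  // 2*NewSqrt2 (sqrt(2) in Q(NewSqrt2Bits) fixed point) and round.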
+  const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+  const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+
+  for (int i = 0; i < 16; i++) {
+    int32x4_t a = vmulq_s32(in[i], fact);
+    a = vaddq_s32(a, offset);
+    out[i] = vshrq_n_s32(a, NewSqrt2Bits);
+  }
+}
+
+static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+                                  const int howmany) {
+  const int stride = 16;
+  int i = 0;
+  do {
+    highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+                                   int howmany) {
+  const int stride = 16;
+  int i = 0;
+  do {
+    highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
+                                       int bit, int howmany) {
+  const int stride = 16;
+  int i = 0;
+  do {
+    highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Workspaces for column/row-wise transforms.
+  int32x4_t buf0[64], buf1[64];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case ADST_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case ADST_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case IDTX:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    default: assert(0);
+  }
+}
+
+typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
+                                          int stride, int bit, int lr_flip);
+typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
+                                               int32x4_t *out, int stride,
+                                               int bit, int lr_flip,
+                                               int howmany, int hm_stride);
+
+typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
+                                          int bit, int stride);
+typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
+                                               int32_t *out, int bit,
+                                               int howmany, int hm_stride,
+                                               int stride);
+
+// Construct component kernels that include the load_buffer and store_buffer
+// stages to avoid the need to spill loaded data to the stack between these and
+// the txfm kernel calls.
+// The TRANSFORM_*_ONE cases are only ever called in situations where the
+// howmany parameter would be one, so they omit the loop entirely.
+
+#define TRANSFORM_COL_ONE(name, n)                                    \
+  static void highbd_##name##_col_neon(const int16_t *input,          \
+                                       int32x4_t *output, int stride, \
+                                       int cos_bit, int lr_flip) {    \
+    int32x4_t buf0[n];                                                \
+    load_buffer_4x##n(input, buf0, stride, lr_flip);                  \
+    highbd_##name##_x4_neon(buf0, output, cos_bit);                   \
+  }
+
+#define TRANSFORM_COL_MANY(name, n)                                     \
+  static void highbd_##name##_col_many_neon(                            \
+      const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
+      int lr_flip, int howmany, int hm_stride) {                        \
+    int i = 0;                                                          \
+    do {                                                                \
+      int32x4_t buf0[n];                                                \
+      load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip);          \
+      highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit);   \
+    } while (++i < howmany);                                            \
+  }
+
+#define TRANSFORM_ROW_ONE(name, n)                                        \
+  static void highbd_##name##_row_neon(                                   \
+      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+    int32x4_t buf0[n];                                                    \
+    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
+    store_buffer_##n##x4(buf0, output, stride);                           \
+  }
+
+#define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
+  static void highbd_##name##_row_rect_neon(                              \
+      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+    int32x4_t buf0[n];                                                    \
+    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
+    round_rect_array_s32_neon(buf0, buf0, (n));                           \
+    store_buffer_##n##x4(buf0, output, stride);                           \
+  }
+
+#define TRANSFORM_ROW_MANY(name, n)                                      \
+  static void highbd_##name##_row_many_neon(                             \
+      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+      int hm_stride, int stride) {                                       \
+    int i = 0;                                                           \
+    do {                                                                 \
+      int32x4_t buf0[n];                                                 \
+      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
+      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
+    } while (++i < howmany);                                             \
+  }
+
+#define TRANSFORM_ROW_RECT_MANY(name, n)                                 \
+  static void highbd_##name##_row_rect_many_neon(                        \
+      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+      int hm_stride, int stride) {                                       \
+    int i = 0;                                                           \
+    do {                                                                 \
+      int32x4_t buf0[n];                                                 \
+      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
+      round_rect_array_s32_neon(buf0, buf0, (n));                        \
+      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
+    } while (++i < howmany);                                             \
+  }
+
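+// For reference, TRANSFORM_COL_ONE(fdct8, 8) below expands to:
+//
+//   static void highbd_fdct8_col_neon(const int16_t *input, int32x4_t *output,
+//                                     int stride, int cos_bit, int lr_flip) {
+//     int32x4_t buf0[8];
+//     load_buffer_4x8(input, buf0, stride, lr_flip);
+//     highbd_fdct8_x4_neon(buf0, output, cos_bit);
+//   }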
+TRANSFORM_COL_ONE(fdct8, 8)
+TRANSFORM_COL_ONE(fadst8, 8)
+TRANSFORM_COL_ONE(fidentity8, 8)
+
+TRANSFORM_COL_MANY(fdct4, 4)
+TRANSFORM_COL_MANY(fdct8, 8)
+TRANSFORM_COL_MANY(fdct16, 16)
+TRANSFORM_COL_MANY(fadst4, 4)
+TRANSFORM_COL_MANY(fadst8, 8)
+TRANSFORM_COL_MANY(fadst16, 16)
+TRANSFORM_COL_MANY(fidentity4, 4)
+TRANSFORM_COL_MANY(fidentity8, 8)
+TRANSFORM_COL_MANY(fidentity16, 16)
+
+TRANSFORM_ROW_ONE(fdct16, 16)
+TRANSFORM_ROW_ONE(fadst16, 16)
+TRANSFORM_ROW_ONE(fidentity16, 16)
+
+TRANSFORM_ROW_RECT_ONE(fdct8, 8)
+TRANSFORM_ROW_RECT_ONE(fadst8, 8)
+TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
+
+#if !CONFIG_REALTIME_ONLY
+TRANSFORM_ROW_MANY(fdct4, 4)
+TRANSFORM_ROW_MANY(fdct8, 8)
+TRANSFORM_ROW_MANY(fadst4, 4)
+TRANSFORM_ROW_MANY(fadst8, 8)
+TRANSFORM_ROW_MANY(fidentity4, 4)
+TRANSFORM_ROW_MANY(fidentity8, 8)
+#endif
+
+TRANSFORM_ROW_RECT_MANY(fdct4, 4)
+TRANSFORM_ROW_RECT_MANY(fdct8, 8)
+TRANSFORM_ROW_RECT_MANY(fdct16, 16)
+TRANSFORM_ROW_RECT_MANY(fadst4, 4)
+TRANSFORM_ROW_RECT_MANY(fadst8, 8)
+TRANSFORM_ROW_RECT_MANY(fadst16, 16)
+TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
+TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
+TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
+
+static const fwd_transform_1d_col_many_neon
+    col_highbd_txfm8_xn_arr[TX_TYPES] = {
+      highbd_fdct8_col_many_neon,       // DCT_DCT
+      highbd_fadst8_col_many_neon,      // ADST_DCT
+      highbd_fdct8_col_many_neon,       // DCT_ADST
+      highbd_fadst8_col_many_neon,      // ADST_ADST
+      highbd_fadst8_col_many_neon,      // FLIPADST_DCT
+      highbd_fdct8_col_many_neon,       // DCT_FLIPADST
+      highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst8_col_many_neon,      // ADST_FLIPADST
+      highbd_fadst8_col_many_neon,      // FLIPADST_ADST
+      highbd_fidentity8_col_many_neon,  // IDTX
+      highbd_fdct8_col_many_neon,       // V_DCT
+      highbd_fidentity8_col_many_neon,  // H_DCT
+      highbd_fadst8_col_many_neon,      // V_ADST
+      highbd_fidentity8_col_many_neon,  // H_ADST
+      highbd_fadst8_col_many_neon,      // V_FLIPADST
+      highbd_fidentity8_col_many_neon   // H_FLIPADST
+    };
+
+static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
+  highbd_fdct8_col_neon,       // DCT_DCT
+  highbd_fadst8_col_neon,      // ADST_DCT
+  highbd_fdct8_col_neon,       // DCT_ADST
+  highbd_fadst8_col_neon,      // ADST_ADST
+  highbd_fadst8_col_neon,      // FLIPADST_DCT
+  highbd_fdct8_col_neon,       // DCT_FLIPADST
+  highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
+  highbd_fadst8_col_neon,      // ADST_FLIPADST
+  highbd_fadst8_col_neon,      // FLIPADST_ADST
+  highbd_fidentity8_col_neon,  // IDTX
+  highbd_fdct8_col_neon,       // V_DCT
+  highbd_fidentity8_col_neon,  // H_DCT
+  highbd_fadst8_col_neon,      // V_ADST
+  highbd_fidentity8_col_neon,  // H_ADST
+  highbd_fadst8_col_neon,      // V_FLIPADST
+  highbd_fidentity8_col_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_col_many_neon
+    col_highbd_txfm16_xn_arr[TX_TYPES] = {
+      highbd_fdct16_col_many_neon,       // DCT_DCT
+      highbd_fadst16_col_many_neon,      // ADST_DCT
+      highbd_fdct16_col_many_neon,       // DCT_ADST
+      highbd_fadst16_col_many_neon,      // ADST_ADST
+      highbd_fadst16_col_many_neon,      // FLIPADST_DCT
+      highbd_fdct16_col_many_neon,       // DCT_FLIPADST
+      highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst16_col_many_neon,      // ADST_FLIPADST
+      highbd_fadst16_col_many_neon,      // FLIPADST_ADST
+      highbd_fidentity16_col_many_neon,  // IDTX
+      highbd_fdct16_col_many_neon,       // V_DCT
+      highbd_fidentity16_col_many_neon,  // H_DCT
+      highbd_fadst16_col_many_neon,      // V_ADST
+      highbd_fidentity16_col_many_neon,  // H_ADST
+      highbd_fadst16_col_many_neon,      // V_FLIPADST
+      highbd_fidentity16_col_many_neon   // H_FLIPADST
+    };
+
+static const fwd_transform_1d_col_many_neon
+    col_highbd_txfm4_xn_arr[TX_TYPES] = {
+      highbd_fdct4_col_many_neon,       // DCT_DCT
+      highbd_fadst4_col_many_neon,      // ADST_DCT
+      highbd_fdct4_col_many_neon,       // DCT_ADST
+      highbd_fadst4_col_many_neon,      // ADST_ADST
+      highbd_fadst4_col_many_neon,      // FLIPADST_DCT
+      highbd_fdct4_col_many_neon,       // DCT_FLIPADST
+      highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst4_col_many_neon,      // ADST_FLIPADST
+      highbd_fadst4_col_many_neon,      // FLIPADST_ADST
+      highbd_fidentity4_col_many_neon,  // IDTX
+      highbd_fdct4_col_many_neon,       // V_DCT
+      highbd_fidentity4_col_many_neon,  // H_DCT
+      highbd_fadst4_col_many_neon,      // V_ADST
+      highbd_fidentity4_col_many_neon,  // H_ADST
+      highbd_fadst4_col_many_neon,      // V_FLIPADST
+      highbd_fidentity4_col_many_neon   // H_FLIPADST
+    };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
+  highbd_fdct16_row_neon,       // DCT_DCT
+  highbd_fdct16_row_neon,       // ADST_DCT
+  highbd_fadst16_row_neon,      // DCT_ADST
+  highbd_fadst16_row_neon,      // ADST_ADST
+  highbd_fdct16_row_neon,       // FLIPADST_DCT
+  highbd_fadst16_row_neon,      // DCT_FLIPADST
+  highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
+  highbd_fadst16_row_neon,      // ADST_FLIPADST
+  highbd_fadst16_row_neon,      // FLIPADST_ADST
+  highbd_fidentity16_row_neon,  // IDTX
+  highbd_fidentity16_row_neon,  // V_DCT
+  highbd_fdct16_row_neon,       // H_DCT
+  highbd_fidentity16_row_neon,  // V_ADST
+  highbd_fadst16_row_neon,      // H_ADST
+  highbd_fidentity16_row_neon,  // V_FLIPADST
+  highbd_fadst16_row_neon       // H_FLIPADST
+};
+
+static const fwd_transform_1d_row_many_neon
+    row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
+      highbd_fdct16_row_rect_many_neon,       // DCT_DCT
+      highbd_fdct16_row_rect_many_neon,       // ADST_DCT
+      highbd_fadst16_row_rect_many_neon,      // DCT_ADST
+      highbd_fadst16_row_rect_many_neon,      // ADST_ADST
+      highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
+      highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
+      highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
+      highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
+      highbd_fidentity16_row_rect_many_neon,  // IDTX
+      highbd_fidentity16_row_rect_many_neon,  // V_DCT
+      highbd_fdct16_row_rect_many_neon,       // H_DCT
+      highbd_fidentity16_row_rect_many_neon,  // V_ADST
+      highbd_fadst16_row_rect_many_neon,      // H_ADST
+      highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
+      highbd_fadst16_row_rect_many_neon       // H_FLIPADST
+    };
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+    row_highbd_txfm8_xn_arr[TX_TYPES] = {
+      highbd_fdct8_row_many_neon,       // DCT_DCT
+      highbd_fdct8_row_many_neon,       // ADST_DCT
+      highbd_fadst8_row_many_neon,      // DCT_ADST
+      highbd_fadst8_row_many_neon,      // ADST_ADST
+      highbd_fdct8_row_many_neon,       // FLIPADST_DCT
+      highbd_fadst8_row_many_neon,      // DCT_FLIPADST
+      highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst8_row_many_neon,      // ADST_FLIPADST
+      highbd_fadst8_row_many_neon,      // FLIPADST_ADST
+      highbd_fidentity8_row_many_neon,  // IDTX
+      highbd_fidentity8_row_many_neon,  // V_DCT
+      highbd_fdct8_row_many_neon,       // H_DCT
+      highbd_fidentity8_row_many_neon,  // V_ADST
+      highbd_fadst8_row_many_neon,      // H_ADST
+      highbd_fidentity8_row_many_neon,  // V_FLIPADST
+      highbd_fadst8_row_many_neon       // H_FLIPADST
+    };
+#endif
+
+static const fwd_transform_1d_row_many_neon
+    row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
+      highbd_fdct8_row_rect_many_neon,       // DCT_DCT
+      highbd_fdct8_row_rect_many_neon,       // ADST_DCT
+      highbd_fadst8_row_rect_many_neon,      // DCT_ADST
+      highbd_fadst8_row_rect_many_neon,      // ADST_ADST
+      highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
+      highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
+      highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
+      highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
+      highbd_fidentity8_row_rect_many_neon,  // IDTX
+      highbd_fidentity8_row_rect_many_neon,  // V_DCT
+      highbd_fdct8_row_rect_many_neon,       // H_DCT
+      highbd_fidentity8_row_rect_many_neon,  // V_ADST
+      highbd_fadst8_row_rect_many_neon,      // H_ADST
+      highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
+      highbd_fadst8_row_rect_many_neon       // H_FLIPADST
+    };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
+  highbd_fdct8_row_rect_neon,       // DCT_DCT
+  highbd_fdct8_row_rect_neon,       // ADST_DCT
+  highbd_fadst8_row_rect_neon,      // DCT_ADST
+  highbd_fadst8_row_rect_neon,      // ADST_ADST
+  highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
+  highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
+  highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
+  highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
+  highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
+  highbd_fidentity8_row_rect_neon,  // IDTX
+  highbd_fidentity8_row_rect_neon,  // V_DCT
+  highbd_fdct8_row_rect_neon,       // H_DCT
+  highbd_fidentity8_row_rect_neon,  // V_ADST
+  highbd_fadst8_row_rect_neon,      // H_ADST
+  highbd_fidentity8_row_rect_neon,  // V_FLIPADST
+  highbd_fadst8_row_rect_neon       // H_FLIPADST
+};
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+    row_highbd_txfm4_xn_arr[TX_TYPES] = {
+      highbd_fdct4_row_many_neon,       // DCT_DCT
+      highbd_fdct4_row_many_neon,       // ADST_DCT
+      highbd_fadst4_row_many_neon,      // DCT_ADST
+      highbd_fadst4_row_many_neon,      // ADST_ADST
+      highbd_fdct4_row_many_neon,       // FLIPADST_DCT
+      highbd_fadst4_row_many_neon,      // DCT_FLIPADST
+      highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst4_row_many_neon,      // ADST_FLIPADST
+      highbd_fadst4_row_many_neon,      // FLIPADST_ADST
+      highbd_fidentity4_row_many_neon,  // IDTX
+      highbd_fidentity4_row_many_neon,  // V_DCT
+      highbd_fdct4_row_many_neon,       // H_DCT
+      highbd_fidentity4_row_many_neon,  // V_ADST
+      highbd_fadst4_row_many_neon,      // H_ADST
+      highbd_fidentity4_row_many_neon,  // V_FLIPADST
+      highbd_fadst4_row_many_neon       // H_FLIPADST
+    };
+#endif
+
+static const fwd_transform_1d_row_many_neon
+    row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
+      highbd_fdct4_row_rect_many_neon,       // DCT_DCT
+      highbd_fdct4_row_rect_many_neon,       // ADST_DCT
+      highbd_fadst4_row_rect_many_neon,      // DCT_ADST
+      highbd_fadst4_row_rect_many_neon,      // ADST_ADST
+      highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
+      highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
+      highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
+      highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
+      highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
+      highbd_fidentity4_row_rect_many_neon,  // IDTX
+      highbd_fidentity4_row_rect_many_neon,  // V_DCT
+      highbd_fdct4_row_rect_many_neon,       // H_DCT
+      highbd_fidentity4_row_rect_many_neon,  // V_ADST
+      highbd_fadst4_row_rect_many_neon,      // H_ADST
+      highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
+      highbd_fadst4_row_rect_many_neon       // H_FLIPADST
+    };
+
+static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
+                                  int cos_bit) {
+  const int32_t *const cospi = cospi_arr_s32(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // Workspaces for intermediate transform steps.
+  int32x4_t buf0[32];
+  int32x4_t buf1[32];
+
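+  // butterfly_dct_pre(in, out, n) forms the input butterflies
+  // out[i] = in[i] + in[n-1-i], out[n-1-i] = in[i] - in[n-1-i] for the first
+  // n/2 pairs, and butterfly_dct_post forms the corresponding post-stage
+  // butterflies, matching the explicit add/sub sequences they replace here.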
+  // stage 1
+  butterfly_dct_pre(input, buf1, 32);
+
+  // stage 2
+  butterfly_dct_pre(buf1, buf0, 16);
   buf0[16] = buf1[16];
   buf0[17] = buf1[17];
   buf0[18] = buf1[18];
   buf0[19] = buf1[19];
-  btf_32_neon_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
-                    buf0[27], v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
-                    buf0[25], v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
-                    buf0[24], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
+                      v_cos_bit);
   buf0[28] = buf1[28];
   buf0[29] = buf1[29];
   buf0[30] = buf1[30];
   buf0[31] = buf1[31];
 
   // stage 3
-  cospi = cospi_arr(cos_bit);
-  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
-  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
-  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
-  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
-  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
-  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
-  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
-  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  butterfly_dct_pre(buf0, buf1, 8);
   buf1[8] = buf0[8];
   buf1[9] = buf0[9];
-  btf_32_neon_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
-                    buf1[13], v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
-                    buf1[12], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
+                      v_cos_bit);
   buf1[14] = buf0[14];
   buf1[15] = buf0[15];
-  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
-  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
-  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
-  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
-  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
-  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
-  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
-  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
-  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
-  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
-  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
-  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
-  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
-  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
-  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
 
   // stage 4
-  cospi = cospi_arr(cos_bit);
-  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
-  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
-  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
-  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  butterfly_dct_pre(buf1, buf0, 4);
   buf0[4] = buf1[4];
-  btf_32_neon_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
+                      v_cos_bit);
   buf0[7] = buf1[7];
-  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
-  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
-  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
-  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
-  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
-  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
-  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
-  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
   buf0[16] = buf1[16];
   buf0[17] = buf1[17];
-
-  btf_32_neon_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
-                    buf0[29], v_cos_bit);
-  btf_32_neon_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
-                    buf0[28], v_cos_bit);
-
-  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
-                    buf0[27], v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-
+  butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
+                      v_cos_bit);
+  butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
+                      v_cos_bit);
+  butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
+                      v_cos_bit);
   buf0[22] = buf1[22];
   buf0[23] = buf1[23];
   buf0[24] = buf1[24];
@@ -2126,72 +1600,46 @@
   buf0[31] = buf1[31];
 
   // stage 5
-  btf_32_neon_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
-                    v_cos_bit);
-
-  btf_32_neon_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
-                    v_cos_bit);
-  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
-  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
-  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
-  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
+                      v_cos_bit);
+  butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
   buf1[8] = buf0[8];
-  btf_32_neon_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
-                    buf1[13], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
+                      v_cos_bit);
+  butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
+                      v_cos_bit);
   buf1[11] = buf0[11];
   buf1[12] = buf0[12];
   buf1[15] = buf0[15];
-  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
-  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
-  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
-  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
-  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
-  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
-  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
-  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
-  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
-  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
-  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
-  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
-  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
-  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
-  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
 
   // stage 6
-  cospi = cospi_arr(cos_bit);
   buf0[0] = buf1[0];
   buf0[1] = buf1[1];
   buf0[2] = buf1[2];
   buf0[3] = buf1[3];
 
-  btf_32_neon_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
-                    buf0[30], v_cos_bit);
-  btf_32_neon_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
-                    buf0[29], v_cos_bit);
-
-  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
-  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
-  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
-  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
-  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
-  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
-  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
-  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
+                      v_cos_bit);
+  butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
+                      v_cos_bit);
+  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+  butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
   buf0[16] = buf1[16];
   buf0[19] = buf1[19];
   buf0[20] = buf1[20];
 
-  btf_32_neon_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_neon_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
-                    buf0[25], v_cos_bit);
+  butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
+                      v_cos_bit);
+  butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
+                      v_cos_bit);
 
   buf0[23] = buf1[23];
   buf0[24] = buf1[24];
@@ -2200,7 +1648,6 @@
   buf0[31] = buf1[31];
 
   // stage 7
-  cospi = cospi_arr(cos_bit);
   buf1[0] = buf0[0];
   buf1[1] = buf0[1];
   buf1[2] = buf0[2];
@@ -2209,33 +1656,20 @@
   buf1[5] = buf0[5];
   buf1[6] = buf0[6];
   buf1[7] = buf0[7];
-  btf_32_neon_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
-                    buf1[13], v_cos_bit);
-  btf_32_neon_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
-                    buf1[12], v_cos_bit);
-  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
-  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
-  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
-  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
-  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
-  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
-  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
-  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
-  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
-  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
-  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
-  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
-  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
-  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
-  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
-  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+  butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
+                      v_cos_bit);
+  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+  butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+  butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
 
   // stage 8
-  cospi = cospi_arr(cos_bit);
   buf0[0] = buf1[0];
   buf0[1] = buf1[1];
   buf0[2] = buf1[2];
@@ -2252,313 +1686,70 @@
   buf0[13] = buf1[13];
   buf0[14] = buf1[14];
   buf0[15] = buf1[15];
-  btf_32_neon_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
-                    buf0[30], v_cos_bit);
-  btf_32_neon_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
-                    buf0[29], v_cos_bit);
-  btf_32_neon_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
-                    buf0[28], v_cos_bit);
-  btf_32_neon_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
-                    buf0[27], v_cos_bit);
-  btf_32_neon_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
-                    buf0[26], v_cos_bit);
-  btf_32_neon_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
-                    buf0[25], v_cos_bit);
-  btf_32_neon_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
+                      v_cos_bit);
+  butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
+                      v_cos_bit);
+  butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
+                      v_cos_bit);
 
-  startidx = 0 * stride;
-  endidx = 31 * stride;
   // stage 9
-  output[startidx] = buf0[0];
-  output[endidx] = buf0[31];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[16];
-  output[endidx] = buf0[15];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[8];
-  output[endidx] = buf0[23];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[24];
-  output[endidx] = buf0[7];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[4];
-  output[endidx] = buf0[27];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[20];
-  output[endidx] = buf0[11];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[12];
-  output[endidx] = buf0[19];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[28];
-  output[endidx] = buf0[3];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[2];
-  output[endidx] = buf0[29];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[18];
-  output[endidx] = buf0[13];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[10];
-  output[endidx] = buf0[21];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[26];
-  output[endidx] = buf0[5];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[6];
-  output[endidx] = buf0[25];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[22];
-  output[endidx] = buf0[9];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[14];
-  output[endidx] = buf0[17];
-  startidx += stride;
-  endidx -= stride;
-  output[startidx] = buf0[30];
-  output[endidx] = buf0[1];
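+  // output[i] = buf0[bitrev(i)], where bitrev reverses the five index bits
+  // (0, 16, 8, 24, 4, ...); stores are contiguous now that the stride
+  // parameter has been dropped.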
+  output[0] = buf0[0];
+  output[1] = buf0[16];
+  output[2] = buf0[8];
+  output[3] = buf0[24];
+  output[4] = buf0[4];
+  output[5] = buf0[20];
+  output[6] = buf0[12];
+  output[7] = buf0[28];
+  output[8] = buf0[2];
+  output[9] = buf0[18];
+  output[10] = buf0[10];
+  output[11] = buf0[26];
+  output[12] = buf0[6];
+  output[13] = buf0[22];
+  output[14] = buf0[14];
+  output[15] = buf0[30];
+  output[16] = buf0[1];
+  output[17] = buf0[17];
+  output[18] = buf0[9];
+  output[19] = buf0[25];
+  output[20] = buf0[5];
+  output[21] = buf0[21];
+  output[22] = buf0[13];
+  output[23] = buf0[29];
+  output[24] = buf0[3];
+  output[25] = buf0[19];
+  output[26] = buf0[11];
+  output[27] = buf0[27];
+  output[28] = buf0[7];
+  output[29] = buf0[23];
+  output[30] = buf0[15];
+  output[31] = buf0[31];
 }
 
-void av1_fadst4_new_neon(const int32x4_t *input, int32x4_t *output,
-                         const int8_t cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 4;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-  int32x4_t buf0[4];
-  int32x4_t buf1[4];
-  int col_num = txfm_size / num_per_128;
-  int col;
-  (void)stage_range;
-  cospi = cospi_arr(cos_bit);
-  for (col = 0; col < col_num; col++) {
-    // stage 0;
-    int j;
-    for (j = 0; j < 4; ++j) {
-      buf0[j] = input[j * col_num + col];
-    }
+static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
+                                  int8_t cos_bit) {
+  const int32_t *const cospi = cospi_arr_s32(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
 
-    // stage 1
-    buf1[0] = buf0[3];
-    buf1[1] = buf0[0];
-    buf1[2] = buf0[1];
-    buf1[3] = buf0[2];
-
-    // stage 2
-    btf_32_neon_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
-                      v_cos_bit);
-    btf_32_neon_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
-                      v_cos_bit);
-
-    // stage 3
-    buf1[0] = vaddq_s32(buf0[0], buf0[2]);
-    buf1[2] = vsubq_s32(buf0[0], buf0[2]);
-    buf1[1] = vaddq_s32(buf0[1], buf0[3]);
-    buf1[3] = vsubq_s32(buf0[1], buf0[3]);
-
-    // stage 4
-    cospi = cospi_arr(cos_bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-
-    btf_32_neon_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
-                      v_cos_bit);
-
-    // stage 5
-    buf1[0] = buf0[0];
-    buf1[1] = vnegq_s32(buf0[2]);
-    buf1[2] = buf0[3];
-    buf1[3] = vnegq_s32(buf0[1]);
-
-    for (j = 0; j < 4; ++j) {
-      output[j * col_num + col] = buf1[j];
-    }
-  }
-}
-
-static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
-                                           int32x4_t *x5, const int32_t *cospi,
-                                           const int32x4_t *v_cos_bit,
-                                           int *startidx, int *endidx) {
+  // stage 1
   int32x4_t x1[64];
-  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
-  *startidx += instride;
-  *endidx -= instride;
-  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
-  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+  butterfly_dct_pre(input, x1, 64);
 
   // stage 2
   int32x4_t x2[64];
-  x2[0] = vaddq_s32(x1[0], x1[31]);
-  x2[31] = vsubq_s32(x1[0], x1[31]);
-  x2[1] = vaddq_s32(x1[1], x1[30]);
-  x2[30] = vsubq_s32(x1[1], x1[30]);
-  x2[2] = vaddq_s32(x1[2], x1[29]);
-  x2[29] = vsubq_s32(x1[2], x1[29]);
-  x2[3] = vaddq_s32(x1[3], x1[28]);
-  x2[28] = vsubq_s32(x1[3], x1[28]);
-  x2[4] = vaddq_s32(x1[4], x1[27]);
-  x2[27] = vsubq_s32(x1[4], x1[27]);
-  x2[5] = vaddq_s32(x1[5], x1[26]);
-  x2[26] = vsubq_s32(x1[5], x1[26]);
-  x2[6] = vaddq_s32(x1[6], x1[25]);
-  x2[25] = vsubq_s32(x1[6], x1[25]);
-  x2[7] = vaddq_s32(x1[7], x1[24]);
-  x2[24] = vsubq_s32(x1[7], x1[24]);
-  x2[8] = vaddq_s32(x1[8], x1[23]);
-  x2[23] = vsubq_s32(x1[8], x1[23]);
-  x2[9] = vaddq_s32(x1[9], x1[22]);
-  x2[22] = vsubq_s32(x1[9], x1[22]);
-  x2[10] = vaddq_s32(x1[10], x1[21]);
-  x2[21] = vsubq_s32(x1[10], x1[21]);
-  x2[11] = vaddq_s32(x1[11], x1[20]);
-  x2[20] = vsubq_s32(x1[11], x1[20]);
-  x2[12] = vaddq_s32(x1[12], x1[19]);
-  x2[19] = vsubq_s32(x1[12], x1[19]);
-  x2[13] = vaddq_s32(x1[13], x1[18]);
-  x2[18] = vsubq_s32(x1[13], x1[18]);
-  x2[14] = vaddq_s32(x1[14], x1[17]);
-  x2[17] = vsubq_s32(x1[14], x1[17]);
-  x2[15] = vaddq_s32(x1[15], x1[16]);
-  x2[16] = vsubq_s32(x1[15], x1[16]);
+  butterfly_dct_pre(x1, x2, 32);
   x2[32] = x1[32];
   x2[33] = x1[33];
   x2[34] = x1[34];
@@ -2567,23 +1758,14 @@
   x2[37] = x1[37];
   x2[38] = x1[38];
   x2[39] = x1[39];
-
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
   x2[56] = x1[56];
   x2[57] = x1[57];
   x2[58] = x1[58];
@@ -2595,126 +1777,43 @@
 
   // stage 3
   int32x4_t x3[64];
-  x3[0] = vaddq_s32(x2[0], x2[15]);
-  x3[15] = vsubq_s32(x2[0], x2[15]);
-  x3[1] = vaddq_s32(x2[1], x2[14]);
-  x3[14] = vsubq_s32(x2[1], x2[14]);
-  x3[2] = vaddq_s32(x2[2], x2[13]);
-  x3[13] = vsubq_s32(x2[2], x2[13]);
-  x3[3] = vaddq_s32(x2[3], x2[12]);
-  x3[12] = vsubq_s32(x2[3], x2[12]);
-  x3[4] = vaddq_s32(x2[4], x2[11]);
-  x3[11] = vsubq_s32(x2[4], x2[11]);
-  x3[5] = vaddq_s32(x2[5], x2[10]);
-  x3[10] = vsubq_s32(x2[5], x2[10]);
-  x3[6] = vaddq_s32(x2[6], x2[9]);
-  x3[9] = vsubq_s32(x2[6], x2[9]);
-  x3[7] = vaddq_s32(x2[7], x2[8]);
-  x3[8] = vsubq_s32(x2[7], x2[8]);
+  butterfly_dct_pre(x2, x3, 16);
   x3[16] = x2[16];
   x3[17] = x2[17];
   x3[18] = x2[18];
   x3[19] = x2[19];
-  btf_32_neon_type0(-cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
   x3[28] = x2[28];
   x3[29] = x2[29];
   x3[30] = x2[30];
   x3[31] = x2[31];
-  x3[32] = vaddq_s32(x2[32], x2[47]);
-  x3[47] = vsubq_s32(x2[32], x2[47]);
-  x3[33] = vaddq_s32(x2[33], x2[46]);
-  x3[46] = vsubq_s32(x2[33], x2[46]);
-  x3[34] = vaddq_s32(x2[34], x2[45]);
-  x3[45] = vsubq_s32(x2[34], x2[45]);
-  x3[35] = vaddq_s32(x2[35], x2[44]);
-  x3[44] = vsubq_s32(x2[35], x2[44]);
-  x3[36] = vaddq_s32(x2[36], x2[43]);
-  x3[43] = vsubq_s32(x2[36], x2[43]);
-  x3[37] = vaddq_s32(x2[37], x2[42]);
-  x3[42] = vsubq_s32(x2[37], x2[42]);
-  x3[38] = vaddq_s32(x2[38], x2[41]);
-  x3[41] = vsubq_s32(x2[38], x2[41]);
-  x3[39] = vaddq_s32(x2[39], x2[40]);
-  x3[40] = vsubq_s32(x2[39], x2[40]);
-  x3[48] = vsubq_s32(x2[63], x2[48]);
-  x3[63] = vaddq_s32(x2[63], x2[48]);
-  x3[49] = vsubq_s32(x2[62], x2[49]);
-  x3[62] = vaddq_s32(x2[62], x2[49]);
-  x3[50] = vsubq_s32(x2[61], x2[50]);
-  x3[61] = vaddq_s32(x2[61], x2[50]);
-  x3[51] = vsubq_s32(x2[60], x2[51]);
-  x3[60] = vaddq_s32(x2[60], x2[51]);
-  x3[52] = vsubq_s32(x2[59], x2[52]);
-  x3[59] = vaddq_s32(x2[59], x2[52]);
-  x3[53] = vsubq_s32(x2[58], x2[53]);
-  x3[58] = vaddq_s32(x2[58], x2[53]);
-  x3[54] = vsubq_s32(x2[57], x2[54]);
-  x3[57] = vaddq_s32(x2[57], x2[54]);
-  x3[55] = vsubq_s32(x2[56], x2[55]);
-  x3[56] = vaddq_s32(x2[56], x2[55]);
+  butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
 
   // stage 4
   int32x4_t x4[64];
-  x4[0] = vaddq_s32(x3[0], x3[7]);
-  x4[7] = vsubq_s32(x3[0], x3[7]);
-  x4[1] = vaddq_s32(x3[1], x3[6]);
-  x4[6] = vsubq_s32(x3[1], x3[6]);
-  x4[2] = vaddq_s32(x3[2], x3[5]);
-  x4[5] = vsubq_s32(x3[2], x3[5]);
-  x4[3] = vaddq_s32(x3[3], x3[4]);
-  x4[4] = vsubq_s32(x3[3], x3[4]);
+  butterfly_dct_pre(x3, x4, 8);
   x4[8] = x3[8];
   x4[9] = x3[9];
-  btf_32_neon_type0(-cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
   x4[14] = x3[14];
   x4[15] = x3[15];
-  x4[16] = vaddq_s32(x3[16], x3[23]);
-  x4[23] = vsubq_s32(x3[16], x3[23]);
-  x4[17] = vaddq_s32(x3[17], x3[22]);
-  x4[22] = vsubq_s32(x3[17], x3[22]);
-  x4[18] = vaddq_s32(x3[18], x3[21]);
-  x4[21] = vsubq_s32(x3[18], x3[21]);
-  x4[19] = vaddq_s32(x3[19], x3[20]);
-  x4[20] = vsubq_s32(x3[19], x3[20]);
-  x4[24] = vsubq_s32(x3[31], x3[24]);
-  x4[31] = vaddq_s32(x3[31], x3[24]);
-  x4[25] = vsubq_s32(x3[30], x3[25]);
-  x4[30] = vaddq_s32(x3[30], x3[25]);
-  x4[26] = vsubq_s32(x3[29], x3[26]);
-  x4[29] = vaddq_s32(x3[29], x3[26]);
-  x4[27] = vsubq_s32(x3[28], x3[27]);
-  x4[28] = vaddq_s32(x3[28], x3[27]);
+  butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
   x4[32] = x3[32];
   x4[33] = x3[33];
   x4[34] = x3[34];
   x4[35] = x3[35];
-
-  btf_32_neon_type0(-cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x3[40], x3[55], x4[40], x4[55],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x3[41], x3[54], x4[41], x4[54],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x3[42], x3[53], x4[42], x4[53],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x3[43], x3[52], x4[43], x4[52],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
   x4[44] = x3[44];
   x4[45] = x3[45];
   x4[46] = x3[46];
@@ -2729,134 +1828,54 @@
   x4[63] = x3[63];
 
   // stage 5
-  x5[0] = vaddq_s32(x4[0], x4[3]);
-  x5[3] = vsubq_s32(x4[0], x4[3]);
-  x5[1] = vaddq_s32(x4[1], x4[2]);
-  x5[2] = vsubq_s32(x4[1], x4[2]);
+  int32x4_t x5[64];
+  butterfly_dct_pre(x4, x5, 4);
   x5[4] = x4[4];
-
-  btf_32_neon_type0(-cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
   x5[7] = x4[7];
-  x5[8] = vaddq_s32(x4[8], x4[11]);
-  x5[11] = vsubq_s32(x4[8], x4[11]);
-  x5[9] = vaddq_s32(x4[9], x4[10]);
-  x5[10] = vsubq_s32(x4[9], x4[10]);
-  x5[12] = vsubq_s32(x4[15], x4[12]);
-  x5[15] = vaddq_s32(x4[15], x4[12]);
-  x5[13] = vsubq_s32(x4[14], x4[13]);
-  x5[14] = vaddq_s32(x4[14], x4[13]);
+  butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
   x5[16] = x4[16];
   x5[17] = x4[17];
-
-  btf_32_neon_type0(-cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x4[20], x4[27], x5[20], x5[27],
-                    *v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x4[21], x4[26], x5[21], x5[26],
-                    *v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
   x5[22] = x4[22];
   x5[23] = x4[23];
   x5[24] = x4[24];
   x5[25] = x4[25];
   x5[30] = x4[30];
   x5[31] = x4[31];
-  x5[32] = vaddq_s32(x4[32], x4[39]);
-  x5[39] = vsubq_s32(x4[32], x4[39]);
-  x5[33] = vaddq_s32(x4[33], x4[38]);
-  x5[38] = vsubq_s32(x4[33], x4[38]);
-  x5[34] = vaddq_s32(x4[34], x4[37]);
-  x5[37] = vsubq_s32(x4[34], x4[37]);
-  x5[35] = vaddq_s32(x4[35], x4[36]);
-  x5[36] = vsubq_s32(x4[35], x4[36]);
-  x5[40] = vsubq_s32(x4[47], x4[40]);
-  x5[47] = vaddq_s32(x4[47], x4[40]);
-  x5[41] = vsubq_s32(x4[46], x4[41]);
-  x5[46] = vaddq_s32(x4[46], x4[41]);
-  x5[42] = vsubq_s32(x4[45], x4[42]);
-  x5[45] = vaddq_s32(x4[45], x4[42]);
-  x5[43] = vsubq_s32(x4[44], x4[43]);
-  x5[44] = vaddq_s32(x4[44], x4[43]);
-  x5[48] = vaddq_s32(x4[48], x4[55]);
-  x5[55] = vsubq_s32(x4[48], x4[55]);
-  x5[49] = vaddq_s32(x4[49], x4[54]);
-  x5[54] = vsubq_s32(x4[49], x4[54]);
-  x5[50] = vaddq_s32(x4[50], x4[53]);
-  x5[53] = vsubq_s32(x4[50], x4[53]);
-  x5[51] = vaddq_s32(x4[51], x4[52]);
-  x5[52] = vsubq_s32(x4[51], x4[52]);
-  x5[56] = vsubq_s32(x4[63], x4[56]);
-  x5[63] = vaddq_s32(x4[63], x4[56]);
-  x5[57] = vsubq_s32(x4[62], x4[57]);
-  x5[62] = vaddq_s32(x4[62], x4[57]);
-  x5[58] = vsubq_s32(x4[61], x4[58]);
-  x5[61] = vaddq_s32(x4[61], x4[58]);
-  x5[59] = vsubq_s32(x4[60], x4[59]);
-  x5[60] = vaddq_s32(x4[60], x4[59]);
-}
-
-static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
-                                int8_t cos_bit, const int instride,
-                                const int outstride) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
-
-  int startidx = 0 * instride;
-  int endidx = 63 * instride;
-
-  // stage 1-2-3-4-5
-  int32x4_t x5[64];
-  av1_fdct64_new_stage12345_neon(input, instride, x5, cospi, &v_cos_bit,
-                                 &startidx, &endidx);
+  butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
+  butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
 
   // stage 6
   int32x4_t x6[64];
-  btf_32_neon_type0(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
-                    v_cos_bit);
-  x6[4] = vaddq_s32(x5[4], x5[5]);
-  x6[5] = vsubq_s32(x5[4], x5[5]);
-  x6[6] = vsubq_s32(x5[7], x5[6]);
-  x6[7] = vaddq_s32(x5[7], x5[6]);
+  butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
+  butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
   x6[8] = x5[8];
-  btf_32_neon_type0(-cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[48], -cospi[16], x5[10], x5[13], x6[10], x6[13],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
+  butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
   x6[11] = x5[11];
   x6[12] = x5[12];
   x6[15] = x5[15];
-  x6[16] = vaddq_s32(x5[16], x5[19]);
-  x6[19] = vsubq_s32(x5[16], x5[19]);
-  x6[17] = vaddq_s32(x5[17], x5[18]);
-  x6[18] = vsubq_s32(x5[17], x5[18]);
-  x6[20] = vsubq_s32(x5[23], x5[20]);
-  x6[23] = vaddq_s32(x5[23], x5[20]);
-  x6[21] = vsubq_s32(x5[22], x5[21]);
-  x6[22] = vaddq_s32(x5[22], x5[21]);
-  x6[24] = vaddq_s32(x5[24], x5[27]);
-  x6[27] = vsubq_s32(x5[24], x5[27]);
-  x6[25] = vaddq_s32(x5[25], x5[26]);
-  x6[26] = vsubq_s32(x5[25], x5[26]);
-  x6[28] = vsubq_s32(x5[31], x5[28]);
-  x6[31] = vaddq_s32(x5[31], x5[28]);
-  x6[29] = vsubq_s32(x5[30], x5[29]);
-  x6[30] = vaddq_s32(x5[30], x5[29]);
+  butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
+  butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
   x6[32] = x5[32];
   x6[33] = x5[33];
-
-  btf_32_neon_type0(-cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[24], -cospi[40], x5[44], x5[51], x6[44], x6[51],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[24], -cospi[40], x5[45], x5[50], x6[45], x6[50],
-                    v_cos_bit);
-
+  butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
+  butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
+  butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
+  butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+  butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
+  butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
+  butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
+  butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
   x6[46] = x5[46];
   x6[47] = x5[47];
   x6[48] = x5[48];
@@ -2874,82 +1893,26 @@
   x7[1] = x6[1];
   x7[2] = x6[2];
   x7[3] = x6[3];
-  btf_32_neon_type1(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
-                    v_cos_bit);
-
-  x7[8] = vaddq_s32(x6[8], x6[9]);
-  x7[9] = vsubq_s32(x6[8], x6[9]);
-  x7[10] = vsubq_s32(x6[11], x6[10]);
-  x7[11] = vaddq_s32(x6[11], x6[10]);
-  x7[12] = vaddq_s32(x6[12], x6[13]);
-  x7[13] = vsubq_s32(x6[12], x6[13]);
-  x7[14] = vsubq_s32(x6[15], x6[14]);
-  x7[15] = vaddq_s32(x6[15], x6[14]);
+  butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
+  butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
+  butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
+  butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
   x7[16] = x6[16];
-
-  btf_32_neon_type0(-cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[24], -cospi[40], x6[22], x6[25], x7[22], x7[25],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
+  butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+  butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
+  butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
   x7[23] = x6[23];
   x7[24] = x6[24];
   x7[27] = x6[27];
   x7[28] = x6[28];
   x7[31] = x6[31];
-
-  btf_32_neon_type0(-cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[56], -cospi[8], x5[36], x5[59], x6[36], x6[59],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[56], -cospi[8], x5[37], x5[58], x6[37], x6[58],
-                    v_cos_bit);
-  x6[38] = x5[38];
-  x6[39] = x5[39];
-  x6[40] = x5[40];
-  x6[41] = x5[41];
-
-  btf_32_neon_type1(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
-  btf_32_neon_type0(-cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[56], -cospi[8], x6[18], x6[29], x7[18], x7[29],
-                    v_cos_bit);
-  x7[19] = x6[19];
-  x7[20] = x6[20];
-
-  x7[32] = vaddq_s32(x6[32], x6[35]);
-  x7[35] = vsubq_s32(x6[32], x6[35]);
-  x7[33] = vaddq_s32(x6[33], x6[34]);
-  x7[34] = vsubq_s32(x6[33], x6[34]);
-  x7[36] = vsubq_s32(x6[39], x6[36]);
-  x7[39] = vaddq_s32(x6[39], x6[36]);
-  x7[37] = vsubq_s32(x6[38], x6[37]);
-  x7[38] = vaddq_s32(x6[38], x6[37]);
-  x7[40] = vaddq_s32(x6[40], x6[43]);
-  x7[43] = vsubq_s32(x6[40], x6[43]);
-  x7[41] = vaddq_s32(x6[41], x6[42]);
-  x7[42] = vsubq_s32(x6[41], x6[42]);
-  x7[44] = vsubq_s32(x6[47], x6[44]);
-  x7[47] = vaddq_s32(x6[47], x6[44]);
-  x7[45] = vsubq_s32(x6[46], x6[45]);
-  x7[46] = vaddq_s32(x6[46], x6[45]);
-  x7[48] = vaddq_s32(x6[48], x6[51]);
-  x7[51] = vsubq_s32(x6[48], x6[51]);
-  x7[49] = vaddq_s32(x6[49], x6[50]);
-  x7[50] = vsubq_s32(x6[49], x6[50]);
-  x7[52] = vsubq_s32(x6[55], x6[52]);
-  x7[55] = vaddq_s32(x6[55], x6[52]);
-  x7[53] = vsubq_s32(x6[54], x6[53]);
-  x7[54] = vaddq_s32(x6[54], x6[53]);
-  x7[56] = vaddq_s32(x6[56], x6[59]);
-  x7[59] = vsubq_s32(x6[56], x6[59]);
-  x7[57] = vaddq_s32(x6[57], x6[58]);
-  x7[58] = vsubq_s32(x6[57], x6[58]);
-  x7[60] = vsubq_s32(x6[63], x6[60]);
-  x7[63] = vaddq_s32(x6[63], x6[60]);
-  x7[61] = vsubq_s32(x6[62], x6[61]);
-  x7[62] = vaddq_s32(x6[62], x6[61]);
+  butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
+  butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
+  butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
+  butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
 
   // stage 8
   int32x4_t x8[64];
@@ -2962,54 +1925,29 @@
   x8[6] = x7[6];
   x8[7] = x7[7];
 
-  btf_32_neon_type1(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
-                    v_cos_bit);
-  x8[16] = vaddq_s32(x7[16], x7[17]);
-  x8[17] = vsubq_s32(x7[16], x7[17]);
-  x8[18] = vsubq_s32(x7[19], x7[18]);
-  x8[19] = vaddq_s32(x7[19], x7[18]);
-  x8[20] = vaddq_s32(x7[20], x7[21]);
-  x8[21] = vsubq_s32(x7[20], x7[21]);
-  x8[22] = vsubq_s32(x7[23], x7[22]);
-  x8[23] = vaddq_s32(x7[23], x7[22]);
-  x8[24] = vaddq_s32(x7[24], x7[25]);
-  x8[25] = vsubq_s32(x7[24], x7[25]);
-  x8[26] = vsubq_s32(x7[27], x7[26]);
-  x8[27] = vaddq_s32(x7[27], x7[26]);
-  x8[28] = vaddq_s32(x7[28], x7[29]);
-  x8[29] = vsubq_s32(x7[28], x7[29]);
-  x8[30] = vsubq_s32(x7[31], x7[30]);
-  x8[31] = vaddq_s32(x7[31], x7[30]);
+  butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
+  butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
+  butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
+  butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
+  butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
+  butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
+  butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
+  butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
   x8[32] = x7[32];
-
-  btf_32_neon_type0(-cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[60], -cospi[4], x7[34], x7[61], x8[34], x8[61],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
+  butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
   x8[35] = x7[35];
   x8[36] = x7[36];
-  btf_32_neon_type0(-cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[28], -cospi[36], x7[38], x7[57], x8[38], x8[57],
-                    v_cos_bit);
+  butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
+  butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
   x8[39] = x7[39];
   x8[40] = x7[40];
-  btf_32_neon_type0(-cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[44], -cospi[20], x7[42], x7[53], x8[42], x8[53],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
+  butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
   x8[43] = x7[43];
   x8[44] = x7[44];
-  btf_32_neon_type0(-cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
-                    v_cos_bit);
-  btf_32_neon_type0(-cospi[12], -cospi[52], x7[46], x7[49], x8[46], x8[49],
-                    v_cos_bit);
+  butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
+  butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
   x8[47] = x7[47];
   x8[48] = x7[48];
   x8[51] = x7[51];
@@ -3038,56 +1976,22 @@
   x9[13] = x8[13];
   x9[14] = x8[14];
   x9[15] = x8[15];
-
-  btf_32_neon_type1(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
-                    v_cos_bit);
-
-  x9[32] = vaddq_s32(x8[32], x8[33]);
-  x9[33] = vsubq_s32(x8[32], x8[33]);
-  x9[34] = vsubq_s32(x8[35], x8[34]);
-  x9[35] = vaddq_s32(x8[35], x8[34]);
-  x9[36] = vaddq_s32(x8[36], x8[37]);
-  x9[37] = vsubq_s32(x8[36], x8[37]);
-  x9[38] = vsubq_s32(x8[39], x8[38]);
-  x9[39] = vaddq_s32(x8[39], x8[38]);
-  x9[40] = vaddq_s32(x8[40], x8[41]);
-  x9[41] = vsubq_s32(x8[40], x8[41]);
-  x9[42] = vsubq_s32(x8[43], x8[42]);
-  x9[43] = vaddq_s32(x8[43], x8[42]);
-  x9[44] = vaddq_s32(x8[44], x8[45]);
-  x9[45] = vsubq_s32(x8[44], x8[45]);
-  x9[46] = vsubq_s32(x8[47], x8[46]);
-  x9[47] = vaddq_s32(x8[47], x8[46]);
-  x9[48] = vaddq_s32(x8[48], x8[49]);
-  x9[49] = vsubq_s32(x8[48], x8[49]);
-  x9[50] = vsubq_s32(x8[51], x8[50]);
-  x9[51] = vaddq_s32(x8[51], x8[50]);
-  x9[52] = vaddq_s32(x8[52], x8[53]);
-  x9[53] = vsubq_s32(x8[52], x8[53]);
-  x9[54] = vsubq_s32(x8[55], x8[54]);
-  x9[55] = vaddq_s32(x8[55], x8[54]);
-  x9[56] = vaddq_s32(x8[56], x8[57]);
-  x9[57] = vsubq_s32(x8[56], x8[57]);
-  x9[58] = vsubq_s32(x8[59], x8[58]);
-  x9[59] = vaddq_s32(x8[59], x8[58]);
-  x9[60] = vaddq_s32(x8[60], x8[61]);
-  x9[61] = vsubq_s32(x8[60], x8[61]);
-  x9[62] = vsubq_s32(x8[63], x8[62]);
-  x9[63] = vaddq_s32(x8[63], x8[62]);
+  butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
+  butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
+  butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
+  butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
+  butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
+  butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
+  butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
+  butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
+  butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
+  butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
+  butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
+  butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
+  butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
+  butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
+  butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
+  butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
 
   // stage 10
   int32x4_t x10[64];
@@ -3123,903 +2027,593 @@
   x10[29] = x9[29];
   x10[30] = x9[30];
   x10[31] = x9[31];
-  btf_32_neon_type1(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
-                    v_cos_bit);
-  btf_32_neon_type1(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
-                    v_cos_bit);
+  butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
+  butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
+  butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
+  butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
+  butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
+  butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
+  butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
+  butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
+  butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
+  butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
+  butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
+  butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
+  butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
+  butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
+  butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
+  butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
 
-  startidx = 0 * outstride;
-  endidx = 63 * outstride;
   // stage 11
-  output[startidx] = x10[0];
-  output[endidx] = x10[63];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[32];
-  output[endidx] = x10[31];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[16];
-  output[endidx] = x10[47];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[48];
-  output[endidx] = x10[15];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[8];
-  output[endidx] = x10[55];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[40];
-  output[endidx] = x10[23];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[24];
-  output[endidx] = x10[39];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[56];
-  output[endidx] = x10[7];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[4];
-  output[endidx] = x10[59];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[36];
-  output[endidx] = x10[27];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[20];
-  output[endidx] = x10[43];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[52];
-  output[endidx] = x10[11];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[12];
-  output[endidx] = x10[51];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[44];
-  output[endidx] = x10[19];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[28];
-  output[endidx] = x10[35];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[60];
-  output[endidx] = x10[3];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[2];
-  output[endidx] = x10[61];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[34];
-  output[endidx] = x10[29];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[18];
-  output[endidx] = x10[45];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[50];
-  output[endidx] = x10[13];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[10];
-  output[endidx] = x10[53];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[42];
-  output[endidx] = x10[21];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[26];
-  output[endidx] = x10[37];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[58];
-  output[endidx] = x10[5];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[6];
-  output[endidx] = x10[57];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[38];
-  output[endidx] = x10[25];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[22];
-  output[endidx] = x10[41];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[54];
-  output[endidx] = x10[9];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[14];
-  output[endidx] = x10[49];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[46];
-  output[endidx] = x10[17];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[30];
-  output[endidx] = x10[33];
-  startidx += outstride;
-  endidx -= outstride;
-  output[startidx] = x10[62];
-  output[endidx] = x10[1];
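+  // As in highbd_fdct32_x4_neon: output[i] = x10[bitrev(i)], here over six
+  // index bits.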
+  output[0] = x10[0];
+  output[1] = x10[32];
+  output[2] = x10[16];
+  output[3] = x10[48];
+  output[4] = x10[8];
+  output[5] = x10[40];
+  output[6] = x10[24];
+  output[7] = x10[56];
+  output[8] = x10[4];
+  output[9] = x10[36];
+  output[10] = x10[20];
+  output[11] = x10[52];
+  output[12] = x10[12];
+  output[13] = x10[44];
+  output[14] = x10[28];
+  output[15] = x10[60];
+  output[16] = x10[2];
+  output[17] = x10[34];
+  output[18] = x10[18];
+  output[19] = x10[50];
+  output[20] = x10[10];
+  output[21] = x10[42];
+  output[22] = x10[26];
+  output[23] = x10[58];
+  output[24] = x10[6];
+  output[25] = x10[38];
+  output[26] = x10[22];
+  output[27] = x10[54];
+  output[28] = x10[14];
+  output[29] = x10[46];
+  output[30] = x10[30];
+  output[31] = x10[62];
+  output[32] = x10[1];
+  output[33] = x10[33];
+  output[34] = x10[17];
+  output[35] = x10[49];
+  output[36] = x10[9];
+  output[37] = x10[41];
+  output[38] = x10[25];
+  output[39] = x10[57];
+  output[40] = x10[5];
+  output[41] = x10[37];
+  output[42] = x10[21];
+  output[43] = x10[53];
+  output[44] = x10[13];
+  output[45] = x10[45];
+  output[46] = x10[29];
+  output[47] = x10[61];
+  output[48] = x10[3];
+  output[49] = x10[35];
+  output[50] = x10[19];
+  output[51] = x10[51];
+  output[52] = x10[11];
+  output[53] = x10[43];
+  output[54] = x10[27];
+  output[55] = x10[59];
+  output[56] = x10[7];
+  output[57] = x10[39];
+  output[58] = x10[23];
+  output[59] = x10[55];
+  output[60] = x10[15];
+  output[61] = x10[47];
+  output[62] = x10[31];
+  output[63] = x10[63];
 }
 
-void av1_idtx32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
-                         const int col_num) {
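+// 32-point identity transform on four lanes at a time: each coefficient is
+// scaled by 4 (left shift by 2). Input and output are contiguous, so the
+// col_num stride of the old kernel is no longer needed.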
+static void highbd_fidentity32_x4_neon(const int32x4_t *input,
+                                       int32x4_t *output, int cos_bit) {
   (void)cos_bit;
   for (int i = 0; i < 32; i++) {
-    output[i * col_num] = vshlq_n_s32(input[i * col_num], 2);
+    output[i] = vshlq_n_s32(input[i], 2);
   }
 }
 
-static const fwd_transform_1d_neon col_highbd_txfm8x32_arr[TX_TYPES] = {
-  av1_fdct32_new_neon,  // DCT_DCT
-  NULL,                 // ADST_DCT
-  NULL,                 // DCT_ADST
-  NULL,                 // ADST_ADST
-  NULL,                 // FLIPADST_DCT
-  NULL,                 // DCT_FLIPADST
-  NULL,                 // FLIPADST_FLIPADST
-  NULL,                 // ADST_FLIPADST
-  NULL,                 // FLIPADST_ADST
-  av1_idtx32_new_neon,  // IDTX
-  NULL,                 // V_DCT
-  NULL,                 // H_DCT
-  NULL,                 // V_ADST
-  NULL,                 // H_ADST
-  NULL,                 // V_FLIPADST
-  NULL                  // H_FLIPADST
-};
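+// Instantiate the column kernels referenced by the dispatch table below
+// (highbd_fdct32_col_many_neon, highbd_fidentity32_col_many_neon);
+// TRANSFORM_COL_MANY is assumed to be the wrapper-generating macro defined
+// earlier in this file.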
+TRANSFORM_COL_MANY(fdct32, 32)
+TRANSFORM_COL_MANY(fidentity32, 32)
 
-static const fwd_transform_1d_neon row_highbd_txfm8x32_arr[TX_TYPES] = {
-  fdct16x16_neon,  // DCT_DCT
-  NULL,            // ADST_DCT
-  NULL,            // DCT_ADST
-  NULL,            // ADST_ADST
-  NULL,            // FLIPADST_DCT
-  NULL,            // DCT_FLIPADST
-  NULL,            // FLIPADST_FLIPADST
-  NULL,            // ADST_FLIPADST
-  NULL,            // FLIPADST_ADST
-  idtx16x16_neon,  // IDTX
-  NULL,            // V_DCT
-  NULL,            // H_DCT
-  NULL,            // V_ADST
-  NULL,            // H_ADST
-  NULL,            // V_FLIPADST
-  NULL             // H_FLIPADST
-};
+static const fwd_transform_1d_col_many_neon
+    col_highbd_txfm32_x4_arr[TX_TYPES] = {
+      highbd_fdct32_col_many_neon,       // DCT_DCT
+      NULL,                              // ADST_DCT
+      NULL,                              // DCT_ADST
+      NULL,                              // ADST_ADST
+      NULL,                              // FLIPADST_DCT
+      NULL,                              // DCT_FLIPADST
+      NULL,                              // FLIPADST_FLIPADST
+      NULL,                              // ADST_FLIPADST
+      NULL,                              // FLIPADST_ADST
+      highbd_fidentity32_col_many_neon,  // IDTX
+      NULL,                              // V_DCT
+      NULL,                              // H_DCT
+      NULL,                              // V_ADST
+      NULL,                              // H_ADST
+      NULL,                              // V_FLIPADST
+      NULL                               // H_FLIPADST
+    };
+
+TRANSFORM_ROW_MANY(fdct32, 32)
+TRANSFORM_ROW_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+    row_highbd_txfm32_x4_arr[TX_TYPES] = {
+      highbd_fdct32_row_many_neon,       // DCT_DCT
+      NULL,                              // ADST_DCT
+      NULL,                              // DCT_ADST
+      NULL,                              // ADST_ADST
+      NULL,                              // FLIPADST_DCT
+      NULL,                              // DCT_FLIPADST
+      NULL,                              // FLIPADST_FLIPADST
+      NULL,                              // ADST_FLIPADST
+      NULL,                              // FLIPADST_ADST
+      highbd_fidentity32_row_many_neon,  // IDTX
+      NULL,                              // V_DCT
+      NULL,                              // H_DCT
+      NULL,                              // V_ADST
+      NULL,                              // H_ADST
+      NULL,                              // V_FLIPADST
+      NULL                               // H_FLIPADST
+    };
+
+TRANSFORM_ROW_RECT_MANY(fdct32, 32)
+TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+    row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
+      highbd_fdct32_row_rect_many_neon,       // DCT_DCT
+      NULL,                                   // ADST_DCT
+      NULL,                                   // DCT_ADST
+      NULL,                                   // ADST_ADST
+      NULL,                                   // FLIPADST_DCT
+      NULL,                                   // DCT_FLIPADST
+      NULL,                                   // FLIPADST_FLIPADST
+      NULL,                                   // ADST_FLIPADST
+      NULL,                                   // FLIPADST_ADST
+      highbd_fidentity32_row_rect_many_neon,  // IDTX
+      NULL,                                   // V_DCT
+      NULL,                                   // H_DCT
+      NULL,                                   // V_ADST
+      NULL,                                   // H_ADST
+      NULL,                                   // V_FLIPADST
+      NULL                                    // H_FLIPADST
+    };
 
 void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   (void)bd;
-  int32x4_t in[32], out[32];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
-  const int txw_idx = get_txw_idx(TX_16X8);
-  const int txh_idx = get_txh_idx(TX_16X8);
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x8_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
-  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm8_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_rect_highbd_txfm16_xn_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[2][1];
+
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  for (int i = 0; i < 2; i++) {
-    load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
-    col_txfm(in, in, bit, 2);
-    col_txfm_8x8_rounding(in, &v_shift1);
-    transpose_8x8(in, out + i * 16);
-  }
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
 
+  // Column-wise transform.
+  int32x4_t buf0[32];
   if (lr_flip) {
-    flip_buf_neon(in, out, 32);
-    row_txfm(in, out, bit, 2);
+    col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
+             /*hm_stride=*/-8);
   } else {
-    row_txfm(out, out, bit, 2);
+    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
+             /*hm_stride=*/8);
   }
+  shift_right_2_round_s32_x4(buf0, buf0, 32);
 
-  for (int i = 0; i < 2; i++) {
-    av1_round_shift_rect_array_32_neon(out + i * 16, in, 16, -shift[2],
-                                       NewSqrt2);
-    write_buffer_8x8(in, coeff + i * 64);
-  }
+  int32x4_t buf1[32];
+  transpose_arrays_s32_16x8(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
 }
 
 void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   (void)bd;
-  int32x4_t in[32], out[32];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
-  const int txw_idx = get_txw_idx(TX_8X16);
-  const int txh_idx = get_txh_idx(TX_8X16);
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x8_arr[tx_type];
-  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_rect_highbd_txfm8_xn_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[1][2];
+
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
 
-  load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
-  col_txfm(in, in, bit, 2);
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  col_txfm_8x16_rounding(in, &v_shift1);
-  transpose_8x8(in, out);
-  transpose_8x8(in + 16, out + 16);
-
-  for (int i = 0; i < 2; i++) {
-    row_txfm(out + i * 16, out, bit, 2);
-    av1_round_shift_rect_array_32_neon(out, out, 16, -shift[2], NewSqrt2);
-    write_buffer_16x8(out, coeff + i * 8, 16);
+  // Column-wise transform.
+  int32x4_t buf0[32];
+  if (lr_flip) {
+    col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
+             /*hm_stride=*/-16);
+  } else {
+    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
+             /*hm_stride=*/16);
   }
-}
+  shift_right_2_round_s32_x4(buf0, buf0, 32);
 
-static INLINE void transpose_8nx8n(const int32x4_t *input, int32x4_t *output,
-                                   const int width, const int height) {
-  const int numcol = height >> 2;
-  const int numrow = width >> 2;
-  for (int j = 0; j < numrow; j++) {
-    for (int i = 0; i < numcol; i++) {
-      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
-                    input[i * width + j + (numrow * 1)],
-                    input[i * width + j + (numrow * 2)],
-                    input[i * width + j + (numrow * 3)],
-                    output[j * height + i + (numcol * 0)],
-                    output[j * height + i + (numcol * 1)],
-                    output[j * height + i + (numcol * 2)],
-                    output[j * height + i + (numcol * 3)]);
-    }
-  }
+  int32x4_t buf1[32];
+  transpose_arrays_s32_8x16(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
 }
 
 #if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   (void)bd;
-
-  int32x4_t in[16];
-  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
-  const int txw_idx = get_txw_idx(TX_4X16);
-  const int txh_idx = get_txh_idx(TX_4X16);
-  const int txfm_size_col = tx_size_wide[TX_4X16];
-  const int txfm_size_row = tx_size_high[TX_4X16];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[0][2];
+  int bitrow = av1_fwd_cos_bit_row[0][2];
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_highbd_txfm4_xn_arr[tx_type];
 
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // col transform
-  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  load_buffer_4x16(input, in, stride, ud_flip, lr_flip, &v_shift0);
-  col_txfm(in, outcoeff128, bitcol, 1);
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  col_txfm_8x8_rounding(outcoeff128, &v_shift1);
-  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
 
-  // row transform
-  for (int i = 0; i < txfm_size_col; i++) {
-    int32x4_t tmp[4];
-    row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
-    store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
+  // Column-wise transform.
+  int32x4_t buf0[16];
+  if (lr_flip) {
+    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
+             /*hm_stride=*/0);
+  } else {
+    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
+             /*hm_stride=*/0);
   }
+  shift_right_1_round_s32_x4(buf0, buf0, 16);
+
+  int32x4_t buf1[16];
+  transpose_arrays_s32_4x16(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
 }
 #endif
 
 void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
   (void)bd;
+  int bitcol = av1_fwd_cos_bit_col[2][0];
+  int bitrow = av1_fwd_cos_bit_row[2][0];
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm4_xn_arr[tx_type];
+  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];
 
-  int32x4_t in[16];
-  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
-  const int txw_idx = get_txw_idx(TX_16X4);
-  const int txh_idx = get_txh_idx(TX_16X4);
-  const int txfm_size_col = tx_size_wide[TX_16X4];
-  const int txfm_size_row = tx_size_high[TX_16X4];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
 
-  // col transform
-  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  load_buffer_16x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
-
-  for (int i = 0; i < (txfm_size_col >> 2); i++) {
-    int32x4_t *cur_in = &in[i * txfm_size_row];
-    col_txfm(cur_in, cur_in, bitcol, 1);
-    transpose_4x4(cur_in, cur_in);
+  // Column-wise transform.
+  int32x4_t buf0[16];
+  if (lr_flip) {
+    col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
+             /*hm_stride=*/-4);
+  } else {
+    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+             /*hm_stride=*/4);
   }
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  col_txfm_8x8_rounding(in, &v_shift1);
 
-  // row transform
-  row_txfm(in, outcoeff128, bitrow, 1);
+  shift_right_1_round_s32_x4(buf0, buf0, 16);
+  transpose_arrays_s32_4x16(buf0, buf0);
+
+  // Row-wise transform.
+  row_txfm(buf0, coeff, bitrow, /*stride=*/4);
 }
 
 void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   (void)bd;
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_rect_highbd_txfm16_xn_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[2][3];
+  int bitrow = av1_fwd_cos_bit_row[2][3];
 
-  int32x4_t in[128];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
-  const int txw_idx = get_txw_idx(TX_16X32);
-  const int txh_idx = get_txh_idx(TX_16X32);
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x32_arr[tx_type];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  // Column-wise transform.
+  int32x4_t buf0[128];
+  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+           /*hm_stride=*/32);
+  shift_right_4_round_s32_x4(buf0, buf0, 128);
 
-  // column transform
-  load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-  load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+  int32x4_t buf1[128];
+  transpose_arrays_s32_16x32(buf0, buf1);
 
-  for (int i = 0; i < 4; i++) {
-    col_txfm((in + i), (in + i), bitcol, 4);
-  }
-
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(&in[0], &v_shift);
-  col_txfm_16x16_rounding(&in[64], &v_shift);
-  transpose_8nx8n(in, outcoef128, 16, 32);
-
-  // row transform
-  row_txfm(outcoef128, in, bitrow, 8);
-  av1_round_shift_rect_array_32_neon(in, outcoef128, 128, -shift[2], NewSqrt2);
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
-  (void)tx_type;
   (void)bd;
+  (void)tx_type;
+  int bitcol = av1_fwd_cos_bit_col[3][4];
+  int bitrow = av1_fwd_cos_bit_row[3][4];
 
-  int32x4_t in[512];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
-  const int txw_idx = get_txw_idx(TX_32X64);
-  const int txh_idx = get_txh_idx(TX_32X64);
-  const int txfm_size_col = tx_size_wide[TX_32X64];
-  const int txfm_size_row = tx_size_high[TX_32X64];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int num_row = txfm_size_row >> 2;
-  const int num_col = txfm_size_col >> 2;
+  // Column-wise transform.
+  int32x4_t buf0[512];
+  load_buffer_32x64(input, buf0, stride, 0);
+  for (int i = 0; i < 8; i++) {
+    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
+  }
+  shift_right_2_round_s32_x4(buf0, buf0, 512);
 
-  // column transform
-  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
-  for (int i = 0; i < num_col; i++) {
-    av1_fdct64_new_neon((in + i), (in + i), bitcol, num_col, num_col);
-  }
+  int32x4_t buf1[512];
+  transpose_arrays_s32_32x64(buf0, buf1);
 
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  for (int i = 0; i < num_col; i++) {
-    col_txfm_16x16_rounding((in + i * txfm_size_row), &v_shift);
+  // Row-wise transform.
+  for (int i = 0; i < 16; i++) {
+    highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
   }
-  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
-
-  // row transform
-  for (int i = 0; i < num_row; i++) {
-    av1_fdct32_new_neon((outcoef128 + i), (in + i), bitrow, num_row);
-  }
-  for (int i = 0; i < txfm_size_col; i++) {
-    av1_round_shift_rect_array_32_neon(in + i * 16, outcoef128 + i * 8, 8,
-                                       -shift[2], NewSqrt2);
-  }
+  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+  store_buffer_32x32(buf1, coeff, /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
-  (void)tx_type;
-  int32x4_t in[512];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
-  const int txw_idx = get_txw_idx(TX_64X32);
-  const int txh_idx = get_txh_idx(TX_64X32);
-  const int txfm_size_col = tx_size_wide[TX_64X32];
-  const int txfm_size_row = tx_size_high[TX_64X32];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const int num_row = txfm_size_row >> 2;
-  const int num_col = txfm_size_col >> 2;
-
-  // column transform
-  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  for (int i = 0; i < 32; i++) {
-    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0,
-                    &v_shift0);
-    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
-                    &v_shift0);
-    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
-                    &v_shift0);
-    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
-                    &v_shift0);
-  }
-
-  for (int i = 0; i < num_col; i++) {
-    av1_fdct32_new_neon((in + i), (in + i), bitcol, num_col);
-  }
-
-  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  for (int i = 0; i < num_row; i++) {
-    col_txfm_16x16_rounding((in + i * txfm_size_col), &v_shift1);
-  }
-  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
-
-  // row transform
-  for (int i = 0; i < num_row; i++) {
-    av1_fdct64_new_neon((outcoef128 + i), (in + i), bitrow, num_row, num_row);
-  }
-  av1_round_shift_rect_array_32_neon(in, outcoef128, 512, -shift[2], NewSqrt2);
   (void)bd;
+  (void)tx_type;
+  int bitcol = av1_fwd_cos_bit_col[4][3];
+  int bitrow = av1_fwd_cos_bit_row[4][3];
+
+  // Column-wise transform.
+  int32x4_t buf0[512];
+  load_buffer_64x32(input, buf0, stride, 0);
+  for (int i = 0; i < 16; i++) {
+    highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
+  }
+  shift_right_4_round_s32_x4(buf0, buf0, 512);
+
+  int32x4_t buf1[512];
+  transpose_arrays_s32_64x32(buf0, buf1);
+
+  // Row-wise transform.
+  for (int i = 0; i < 8; i++) {
+    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
+  }
+  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+  store_buffer_64x32(buf1, coeff, /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
-  int32x4_t in[128];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
-  const int txw_idx = get_txw_idx(TX_32X16);
-  const int txh_idx = get_txh_idx(TX_32X16);
-  const fwd_transform_1d_neon col_txfm = row_highbd_txfm8x32_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
-  // column transform
-  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
-  col_txfm(in, in, bitcol, 8);
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(&in[0], &v_shift);
-  col_txfm_16x16_rounding(&in[64], &v_shift);
-  transpose_8nx8n(in, outcoef128, 32, 16);
-
-  // row transform
-  for (int i = 0; i < 4; i++) {
-    row_txfm((outcoef128 + i), (in + i), bitrow, 4);
-  }
-  av1_round_shift_rect_array_32_neon(in, outcoef128, 128, -shift[2], NewSqrt2);
   (void)bd;
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_rect_highbd_txfm32_x4_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[3][2];
+  int bitrow = av1_fwd_cos_bit_row[3][2];
+
+  // Column-wise transform.
+  int32x4_t buf0[128];
+  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+           /*hm_stride=*/16);
+  shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+  int32x4_t buf1[128];
+  transpose_arrays_s32_32x16(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
 }
 
 #if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
-  int32x4_t in[64];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
-  const int txw_idx = get_txw_idx(TX_8X32);
-  const int txh_idx = get_txh_idx(TX_8X32);
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm32x8_arr[tx_type];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
-  const int txfm_size_col = tx_size_wide[TX_8X32];
-  const int txfm_size_row = tx_size_high[TX_8X32];
-  const int num_col = txfm_size_col >> 2;
-
-  // column transform
-  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
-  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
-                   stride, 0, 0, shift[0]);
-
-  for (int i = 0; i < num_col; i++) {
-    col_txfm((in + i), (in + i), bitcol, num_col);
-  }
-
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(in, &v_shift);
-  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
-
-  // row transform
-  for (int i = 0; i < txfm_size_col; i += 2) {
-    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
-  }
   (void)bd;
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_highbd_txfm8_xn_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[1][3];
+  int bitrow = av1_fwd_cos_bit_row[1][3];
+
+  // Column-wise transform.
+  int32x4_t buf0[64];
+  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+           /*hm_stride=*/32);
+  shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+  int32x4_t buf1[64];
+  transpose_arrays_s32_8x32(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
-  int32x4_t in[64];
-  int32x4_t *outcoef128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
-  const int txw_idx = get_txw_idx(TX_32X8);
-  const int txh_idx = get_txh_idx(TX_32X8);
-  const fwd_transform_1d_neon col_txfm = row_highbd_txfm32x8_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-
-  const int txfm_size_col = tx_size_wide[TX_32X8];
-  const int txfm_size_row = tx_size_high[TX_32X8];
-  const int num_col = txfm_size_row >> 2;
-
-  // column transform
-  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
-  for (int i = 0; i < txfm_size_row; i += 2) {
-    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
-  }
-
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(&in[0], &v_shift);
-  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
-
-  // row transform
-  for (int i = 0; i < num_col; i++) {
-    row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
-  }
   (void)bd;
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm8_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_highbd_txfm32_x4_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[3][1];
+  int bitrow = av1_fwd_cos_bit_row[3][1];
+
+  // Column-wise transform.
+  int32x4_t buf0[64];
+  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+           /*hm_stride=*/8);
+  shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+  int32x4_t buf1[64];
+  transpose_arrays_s32_32x8(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
 }
 #endif
 
 void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
-  int32x4_t in[8];
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
-  const int txw_idx = get_txw_idx(TX_4X8);
-  const int txh_idx = get_txh_idx(TX_4X8);
-  const int txfm_size_col = tx_size_wide[TX_4X8];
-  const int txfm_size_row = tx_size_high[TX_4X8];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x8_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+  (void)bd;
+  int bitcol = av1_fwd_cos_bit_col[0][1];
+  int bitrow = av1_fwd_cos_bit_row[0][1];
+  const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_rect_highbd_txfm4_xn_arr[tx_type];
 
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  load_buffer_4x8(input, in, stride, ud_flip, lr_flip, &v_shift0);
-  col_txfm(in, in, bitcol, 1);
-  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  col_txfm_4x8_rounding(in, &v_shift1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
 
-  for (int i = 0; i < 2; i++) {
-    int32x4_t *cur_in = &in[i * 4];
-    transpose_4x4(cur_in, cur_in);
-    row_txfm(cur_in, cur_in, bitrow, 1);
-    av1_round_shift_rect_array_32_neon(cur_in, cur_in, txfm_size_col, -shift[2],
-                                       NewSqrt2);
-    store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
-  }
-  (void)bd;
+  // Column-wise transform.
+  int32x4_t buf0[8];
+  col_txfm(input, buf0, stride, bitcol, lr_flip);
+  shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+  int32x4_t buf1[8];
+  transpose_arrays_s32_4x8(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
 }
 
 void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
-  int32x4_t in[8];
-  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
-  const int txw_idx = get_txw_idx(TX_8X4);
-  const int txh_idx = get_txh_idx(TX_8X4);
-  const int txfm_size_col = tx_size_wide[TX_8X4];
-  const int txfm_size_row = tx_size_high[TX_8X4];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
-  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
-  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x8_arr[tx_type];
+  (void)bd;
+  const int bitcol = av1_fwd_cos_bit_col[1][0];
+  const int bitrow = av1_fwd_cos_bit_row[1][0];
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm4_xn_arr[tx_type];
+  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];
+
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // col tranform
-  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  load_buffer_8x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
-  for (int i = 0; i < 2; i++) {
-    int32x4_t *cur_in = &in[i * txfm_size_row];
-    col_txfm(cur_in, cur_in, bitcol, 1);
-    transpose_4x4(cur_in, cur_in);
-  }
-  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
-  col_txfm_4x8_rounding(in, &v_shift1);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
 
-  // row tranform
-  row_txfm(in, outcoeff128, bitrow, 1);
-  av1_round_shift_rect_array_32_neon(outcoeff128, outcoeff128, txfm_size_col,
-                                     -shift[2], NewSqrt2);
-  (void)bd;
+  // Column-wise transform.
+  int32x4_t buf0[8];
+  if (lr_flip) {
+    col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
+             /*hm_stride=*/-4);
+  } else {
+    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+             /*hm_stride=*/4);
+  }
+
+  shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+  int32x4_t buf1[8];
+  transpose_arrays_s32_8x4(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, coeff, bitrow, /*stride=*/4);
 }
 
 #if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
-  int32x4_t in[256];
-  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
-  const int txw_idx = get_txw_idx(TX_16X64);
-  const int txh_idx = get_txh_idx(TX_16X64);
-  const int txfm_size_col = tx_size_wide[TX_16X64];
-  const int txfm_size_row = tx_size_high[TX_16X64];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  (void)bd;
+  const int bitcol = av1_fwd_cos_bit_col[2][4];
+  const int bitrow = av1_fwd_cos_bit_row[2][4];
+
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int num_col = txfm_size_col >> 2;
-  // col tranform
-  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  for (int i = 0; i < txfm_size_row; i += num_col) {
-    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
-                    ud_flip, lr_flip, &v_shift0);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
+
+  // Column-wise transform.
+  int32x4_t buf0[256];
+  load_buffer_16x64(input, buf0, stride, lr_flip);
+  for (int i = 0; i < 4; i++) {
+    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
   }
+  shift_right_2_round_s32_x4(buf0, buf0, 256);
 
-  for (int i = 0; i < num_col; i++) {
-    av1_fdct64_new_neon(in + i, outcoeff128 + i, bitcol, num_col, num_col);
-  }
+  int32x4_t buf1[256];
+  transpose_arrays_s32_16x64(buf0, buf1);
 
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(outcoeff128, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
-
-  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
-  fdct16x16_neon(in, outcoeff128, bitrow, 8);
-  (void)bd;
+  // Row-wise transform.
+  highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
+  store_buffer_16x32(buf1, coeff, /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
-  int32x4_t in[256];
-  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
-  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
-  const int txw_idx = get_txw_idx(TX_64X16);
-  const int txh_idx = get_txh_idx(TX_64X16);
-  const int txfm_size_col = tx_size_wide[TX_64X16];
-  const int txfm_size_row = tx_size_high[TX_64X16];
-  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
-  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  (void)bd;
+  const int bitcol = av1_fwd_cos_bit_col[4][2];
+  const int bitrow = av1_fwd_cos_bit_row[4][2];
+
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // col tranform
-  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
-  for (int i = 0; i < txfm_size_row; i++) {
-    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
-                    ud_flip, lr_flip, &v_shift0);
-    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
-                    ud_flip, lr_flip, &v_shift0);
-  }
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
 
-  fdct16x16_neon(in, outcoeff128, bitcol, txfm_size_row);
-  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
-  col_txfm_16x16_rounding(outcoeff128, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
-  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+  // Column-wise transform.
+  int32x4_t buf0[256];
+  load_buffer_64x16(input, buf0, stride, lr_flip);
+  highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
+  shift_right_4_round_s32_x4(buf0, buf0, 256);
 
-  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+  int32x4_t buf1[256];
+  transpose_arrays_s32_64x16(buf0, buf1);
+
+  // Row-wise transform.
   for (int i = 0; i < 4; i++) {
-    av1_fdct64_new_neon(in + i, outcoeff128 + i, bitrow, 4, 4);
+    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
   }
-  memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
-  (void)bd;
+  store_buffer_64x16(buf1, coeff, /*stride=*/16);
+  memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
 }
 #endif
 
-static void fdct64_new_neon(int32x4_t *input, int32x4_t *output,
-                            const int8_t cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 64;
-  const int num_per_128 = 4;
-  int col_num = txfm_size / num_per_128;
-  (void)stage_range;
-  for (int col = 0; col < col_num; col++) {
-    av1_fdct64_new_neon((input + col), (output + col), cos_bit, col_num,
-                        col_num);
-  }
-}
-
-static void fdct32_new_neon(int32x4_t *input, int32x4_t *output,
-                            const int8_t cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 32;
-  const int num_per_128 = 4;
-  int col_num = txfm_size / num_per_128;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    av1_fdct32_new_neon((input + col), (output + col), cos_bit, col_num);
-  }
-}
-
-static void idtx32x32_neon(int32x4_t *input, int32x4_t *output,
-                           const int8_t cos_bit, const int8_t *stage_range) {
-  (void)stage_range;
-
-  for (int i = 0; i < 8; i++) {
-    av1_idtx32_new_neon(&input[i * 32], &output[i * 32], cos_bit, 1);
-  }
-}
-
-typedef void (*TxfmFuncNEON)(int32x4_t *input, int32x4_t *output,
-                             const int8_t cos_bit, const int8_t *stage_range);
-
-static INLINE TxfmFuncNEON fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
-  switch (txfm_type) {
-    case TXFM_TYPE_DCT32: return fdct32_new_neon;
-    case TXFM_TYPE_DCT64: return fdct64_new_neon;
-    case TXFM_TYPE_IDENTITY32: return idtx32x32_neon;
-    default: assert(0);
-  }
-  return NULL;
-}
-
-static INLINE void int16_array_with_stride_to_int32_array_without_stride(
-    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
-  int r, c;
-  for (r = 0; r < txfm1d_size; r++) {
-    for (c = 0; c < txfm1d_size; c++) {
-      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
-    }
-  }
-}
-
-static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
-                                                 int32x4_t *output,
-                                                 const int size,
-                                                 const int bit) {
-  const int32x4_t v_bit = vdupq_n_s32(-bit);
-  for (int i = 0; i < size; i++) output[i] = vrshlq_s32(input[i], v_bit);
-}
-
-static INLINE void transpose_32_4x4(int stride, const int32x4_t *input,
-                                    int32x4_t *output) {
-  int32x4x2_t temp01 = vzipq_s32(input[0 * stride], input[2 * stride]);
-  int32x4x2_t temp23 = vzipq_s32(input[1 * stride], input[3 * stride]);
-
-  const int32x4x2_t output01 = vzipq_s32(temp01.val[0], temp23.val[0]);
-  const int32x4x2_t output23 = vzipq_s32(temp01.val[1], temp23.val[1]);
-
-  output[0 * stride] = output01.val[0];
-  output[1 * stride] = output01.val[1];
-  output[2 * stride] = output23.val[0];
-  output[3 * stride] = output23.val[1];
-}
-
-static INLINE void transpose_32(int txfm_size, const int32x4_t *input,
-                                int32x4_t *output) {
-  const int num_per_128 = 4;
-  const int row_size = txfm_size;
-  const int col_size = txfm_size / num_per_128;
-  int r, c;
-
-  // transpose each 4x4 block internally
-  for (r = 0; r < row_size; r += 4) {
-    for (c = 0; c < col_size; c++) {
-      transpose_32_4x4(col_size, &input[r * col_size + c],
-                       &output[c * 4 * col_size + r / 4]);
-    }
-  }
-}
-
-static INLINE void fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
-                                         const int stride,
-                                         const TXFM_2D_FLIP_CFG *cfg,
-                                         int32_t *txfm_buf) {
-  assert(cfg->tx_size < TX_SIZES);
-  const int txfm_size = tx_size_wide[cfg->tx_size];
-  const int8_t *shift = cfg->shift;
-  const int8_t *stage_range_col = cfg->stage_range_col;
-  const int8_t cos_bit_col = cfg->cos_bit_col;
-  const int8_t cos_bit_row = cfg->cos_bit_row;
-  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
-  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
-  int32x4_t *out_128 = (int32x4_t *)output;
-
-  const int num_per_128 = 4;
-  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
-  int col_num = txfm_size / num_per_128;
-
-  int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
-                                                        txfm_size);
-  /*col wise transform*/
-  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
-  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
-  transpose_32(txfm_size, out_128, buf_128);
-
-  /*row wise transform*/
-  for (int col = 0; col < (col_num >> 1); col++) {
-    av1_fdct64_new_neon((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
-                        (col_num >> 1));
-  }
-
-  txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
-  av1_round_shift_array_32_neon(out_128, out_128, txfm2d_size_128, -shift[2]);
-}
-
-static INLINE void fwd_txfm2d_neon(const int16_t *input, int32_t *output,
-                                   const int stride,
-                                   const TXFM_2D_FLIP_CFG *cfg,
-                                   int32_t *txfm_buf) {
-  assert(cfg->tx_size < TX_SIZES);
-  const int txfm_size = tx_size_wide[cfg->tx_size];
-  const int8_t *shift = cfg->shift;
-  const int8_t *stage_range_col = cfg->stage_range_col;
-  const int8_t *stage_range_row = cfg->stage_range_row;
-  const int8_t cos_bit_col = cfg->cos_bit_col;
-  const int8_t cos_bit_row = cfg->cos_bit_row;
-  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
-  const TxfmFuncNEON txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
-
-  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
-  int32x4_t *out_128 = (int32x4_t *)output;
-  int num_per_128 = 4;
-  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
-
-  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
-                                                        txfm_size);
-  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[0]);
-  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
-  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
-  transpose_32(txfm_size, out_128, buf_128);
-  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
-  av1_round_shift_array_32_neon(out_128, out_128, txfm2d_size_128, -shift[2]);
-}
-
 void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
-  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
-  TXFM_2D_FLIP_CFG cfg;
-  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
   (void)bd;
-  fwd_txfm2d_neon(input, output, stride, &cfg, txfm_buf);
+  const fwd_transform_1d_col_many_neon col_txfm =
+      col_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_txfm =
+      row_highbd_txfm32_x4_arr[tx_type];
+
+  // Column-wise transform.
+  int32x4_t buf0[256];
+  col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
+           /*hm_stride=*/32);
+  shift_right_4_round_s32_x4(buf0, buf0, 256);
+
+  int32x4_t buf1[256];
+  transpose_arrays_s32_32x32(buf0, buf1);
+
+  // Row-wise transform.
+  row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
+           /*stride=*/32);
 }
 
 void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
-  DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
-  TXFM_2D_FLIP_CFG cfg;
-  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
   (void)bd;
-  fwd_txfm2d_64x64_neon(input, output, stride, &cfg, txfm_buf);
+  (void)tx_type;
+
+  // Column-wise transform.
+  int32x4_t buf0[1024];
+  load_buffer_64x64(input, buf0, stride, 0);
+  for (int col = 0; col < 16; col++) {
+    highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
+  }
+  shift_right_2_round_s32_x4(buf0, buf0, 1024);
+
+  int32x4_t buf1[1024];
+  transpose_arrays_s32_64x64(buf0, buf1);
+
+  // Row-wise transform.
+  for (int col = 0; col < 8; col++) {
+    highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
+  }
+  shift_right_2_round_s32_x4(buf1, buf1, 512);
+  store_buffer_64x32(buf1, output, /*stride=*/32);
 }
diff --git a/av1/encoder/arm/neon/highbd_pickrst_neon.c b/av1/encoder/arm/neon/highbd_pickrst_neon.c
new file mode 100644
index 0000000..76e0344
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -0,0 +1,741 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void highbd_calc_proj_params_r0_r1_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  int64x2_t h00_lo = vdupq_n_s64(0);
+  int64x2_t h00_hi = vdupq_n_s64(0);
+  int64x2_t h11_lo = vdupq_n_s64(0);
+  int64x2_t h11_hi = vdupq_n_s64(0);
+  int64x2_t h01_lo = vdupq_n_s64(0);
+  int64x2_t h01_hi = vdupq_n_s64(0);
+  int64x2_t c0_lo = vdupq_n_s64(0);
+  int64x2_t c0_hi = vdupq_n_s64(0);
+  int64x2_t c1_lo = vdupq_n_s64(0);
+  int64x2_t c1_hi = vdupq_n_s64(0);
+
+  do {
+    const uint16_t *src_ptr = src;
+    const uint16_t *dat_ptr = dat;
+    int32_t *flt0_ptr = flt0;
+    int32_t *flt1_ptr = flt1;
+    int w = width;
+
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr);
+      uint16x8_t d = vld1q_u16(dat_ptr);
+      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+      int32x4_t u_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t u_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t s_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+      int32x4_t s_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+      s_lo = vsubq_s32(s_lo, u_lo);
+      s_hi = vsubq_s32(s_hi, u_hi);
+
+      f0_lo = vsubq_s32(f0_lo, u_lo);
+      f0_hi = vsubq_s32(f0_hi, u_hi);
+      f1_lo = vsubq_s32(f1_lo, u_lo);
+      f1_hi = vsubq_s32(f1_hi, u_hi);
+
+      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+      h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+      h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+      h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+      h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt0_ptr += 8;
+      flt1_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt0 += flt0_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+  H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  H[1][0] = H[0][1];
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r0_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  int64x2_t h00_lo = vdupq_n_s64(0);
+  int64x2_t h00_hi = vdupq_n_s64(0);
+  int64x2_t c0_lo = vdupq_n_s64(0);
+  int64x2_t c0_hi = vdupq_n_s64(0);
+
+  do {
+    const uint16_t *src_ptr = src;
+    const uint16_t *dat_ptr = dat;
+    int32_t *flt0_ptr = flt0;
+    int w = width;
+
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr);
+      uint16x8_t d = vld1q_u16(dat_ptr);
+      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+      int32x4_t u_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t u_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t s_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+      int32x4_t s_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+      s_lo = vsubq_s32(s_lo, u_lo);
+      s_hi = vsubq_s32(s_hi, u_hi);
+
+      f0_lo = vsubq_s32(f0_lo, u_lo);
+      f0_hi = vsubq_s32(f0_hi, u_hi);
+
+      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt0_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt0 += flt0_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r1_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+  int64x2_t h11_lo = vdupq_n_s64(0);
+  int64x2_t h11_hi = vdupq_n_s64(0);
+  int64x2_t c1_lo = vdupq_n_s64(0);
+  int64x2_t c1_hi = vdupq_n_s64(0);
+
+  do {
+    const uint16_t *src_ptr = src;
+    const uint16_t *dat_ptr = dat;
+    int32_t *flt1_ptr = flt1;
+    int w = width;
+
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr);
+      uint16x8_t d = vld1q_u16(dat_ptr);
+      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+      int32x4_t u_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t u_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+      int32x4_t s_lo =
+          vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+      int32x4_t s_hi = vreinterpretq_s32_u32(
+          vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+      s_lo = vsubq_s32(s_lo, u_lo);
+      s_hi = vsubq_s32(s_hi, u_hi);
+
+      f1_lo = vsubq_s32(f1_lo, u_lo);
+      f1_hi = vsubq_s32(f1_hi, u_hi);
+
+      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt1_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src += src_stride;
+    dat += dat_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// This function dispatches to one of 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+//    of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+//    non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+//    non-zero and need to be computed.
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, flt1,
+                                       flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt1, flt1_stride, H, C);
+  }
+}
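+
+// A scalar model of the r[0] > 0 && r[1] > 0 case above, kept purely for
+// reference and disabled; the function name is illustrative and not part of
+// the library. It sketches the per-pixel accumulation that the Neon kernels
+// above vectorize eight samples at a time.
+#if 0
+static INLINE void calc_proj_params_r0_r1_scalar_model(
+    const uint16_t *src, int width, int height, int src_stride,
+    const uint16_t *dat, int dat_stride, const int32_t *flt0, int flt0_stride,
+    const int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  int64_t h00 = 0, h01 = 0, h11 = 0, c0 = 0, c1 = 0;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      // u is the degraded sample in SGRPROJ precision; s, f0 and f1 are the
+      // source and filtered samples expressed relative to u.
+      const int32_t u = (int32_t)dat[i * dat_stride + j] << SGRPROJ_RST_BITS;
+      const int32_t s =
+          ((int32_t)src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f0 = flt0[i * flt0_stride + j] - u;
+      const int32_t f1 = flt1[i * flt1_stride + j] - u;
+      h00 += (int64_t)f0 * f0;
+      h11 += (int64_t)f1 * f1;
+      h01 += (int64_t)f0 * f1;
+      c0 += (int64_t)f0 * s;
+      c1 += (int64_t)f1 * s;
+    }
+  }
+  H[0][0] = h00 / size;
+  H[0][1] = h01 / size;
+  H[1][1] = h11 / size;
+  H[1][0] = H[0][1];
+  C[0] = c0 / size;
+  C[1] = c1 / size;
+}
+#endif  // 0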
+
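+// Compute the average sample value of a width x height block. The stats
+// functions below subtract it from both dgd and src before accumulating
+// M and H.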
+static int16_t highbd_find_average_neon(const int16_t *src, int src_stride,
+                                        int width, int height) {
+  assert(width > 0);
+  assert(height > 0);
+
+  int64x2_t sum_s64 = vdupq_n_s64(0);
+  int64_t sum = 0;
+
+  int h = height;
+  do {
+    int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+    int w = width;
+    const int16_t *row = src;
+    while (w >= 32) {
+      int16x8_t s0 = vld1q_s16(row + 0);
+      int16x8_t s1 = vld1q_s16(row + 8);
+      int16x8_t s2 = vld1q_s16(row + 16);
+      int16x8_t s3 = vld1q_s16(row + 24);
+
+      s0 = vaddq_s16(s0, s1);
+      s2 = vaddq_s16(s2, s3);
+      sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+      sum_s32[1] = vpadalq_s16(sum_s32[1], s2);
+
+      row += 32;
+      w -= 32;
+    }
+
+    if (w >= 16) {
+      int16x8_t s0 = vld1q_s16(row + 0);
+      int16x8_t s1 = vld1q_s16(row + 8);
+
+      s0 = vaddq_s16(s0, s1);
+      sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+
+      row += 16;
+      w -= 16;
+    }
+
+    if (w >= 8) {
+      int16x8_t s0 = vld1q_s16(row);
+      sum_s32[1] = vpadalq_s16(sum_s32[1], s0);
+
+      row += 8;
+      w -= 8;
+    }
+
+    if (w >= 4) {
+      int16x8_t s0 = vcombine_s16(vld1_s16(row), vdup_n_s16(0));
+      sum_s32[0] = vpadalq_s16(sum_s32[0], s0);
+
+      row += 4;
+      w -= 4;
+    }
+
+    while (w-- > 0) {
+      sum += *row++;
+    }
+
+    sum_s64 = vpadalq_s32(sum_s64, vaddq_s32(sum_s32[0], sum_s32[1]));
+
+    src += src_stride;
+  } while (--h != 0);
+  return (int16_t)((horizontal_add_s64x2(sum_s64) + sum) / (height * width));
+}
+
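+// Accumulate into H the auto-covariance terms for one column of the wiener
+// window, filling only the upper triangle (row1 >= row0) of the
+// corresponding block on H's diagonal.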
+static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
+                                     const int wiener_win,
+                                     const int wiener_win2) {
+  for (int row0 = 0; row0 < wiener_win; row0++) {
+    for (int row1 = row0; row1 < wiener_win; row1++) {
+      int auto_cov_idx =
+          (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+      int32x4_t auto_cov =
+          vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+      auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+                           vget_high_s16(dgd[row1]));
+
+      H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+    }
+  }
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void highbd_compute_stats_win7_neon(
+    const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride,
+    int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) {
+  const int wiener_win = 7;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+  // We use an intermediate matrix that will be transposed to get M.
+  int64_t M_trn[49];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int16x8_t vavg = vdupq_n_s16(avg);
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 =
+            vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg);
+        int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src[j] - avg;
+        int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg;
+        int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg;
+        int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg;
+        int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg;
+        int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg;
+        int16_t d5 = dgd[row * dgd_stride + 5 + j] - avg;
+        int16_t d6 = dgd[row * dgd_stride + 6 + j] - avg;
+
+        M_trn[row * wiener_win + 0] += d0 * s;
+        M_trn[row * wiener_win + 1] += d1 * s;
+        M_trn[row * wiener_win + 2] += d2 * s;
+        M_trn[row * wiener_win + 3] += d3 * s;
+        M_trn[row * wiener_win + 4] += d4 * s;
+        M_trn[row * wiener_win + 5] += d5 * s;
+        M_trn[row * wiener_win + 6] += d6 * s;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        int16x8_t dgd0[7];
+        dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+        dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+        dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+        dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+        dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+        dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg);
+        dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[7];
+          dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+          dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+          dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+          dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+          dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+          dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg);
+          dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+      j += 8;
+    }
+
+    if (j < width) {
+      // Process remaining columns using a mask to discard excess elements.
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[7];
+        dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+        dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg);
+        dgd0[5] = vandq_s16(dgd0[5], mask);
+        dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg);
+        dgd0[6] = vandq_s16(dgd0[6], mask);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[7];
+          dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+          dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+          dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+          dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+          dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+          dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg);
+          dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+    }
+    dgd += dgd_stride;
+    src += src_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  transpose_M_win7(M, M_trn, 7);
+
+  // Copy upper triangle of H in the lower one.
+  copy_upper_triangle(H, wiener_win2);
+
+  // Scaling the results.
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12) {
+    bit_depth_divider = 16;
+  } else if (bit_depth == AOM_BITS_10) {
+    bit_depth_divider = 4;
+  }
+
+  for (int i = 0; i < wiener_win2; ++i) {
+    M[i] /= bit_depth_divider;
+    for (int j = 0; j < wiener_win2; ++j) {
+      H[i * wiener_win2 + j] /= bit_depth_divider;
+    }
+  }
+}
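+
+// In scalar terms, and ignoring the final bit-depth scaling, the accumulation
+// above amounts to the following for every pixel of the restoration unit
+// (illustrative sketch only; Y is the flattened wiener_win x wiener_win
+// window of (dgd - avg) values around the pixel and X is the matching
+// (src - avg) value):
+//
+//   for (int k = 0; k < wiener_win2; k++) {
+//     M[k] += (int64_t)Y[k] * X;
+//     for (int l = k; l < wiener_win2; l++) {
+//       H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l];  // Upper triangle.
+//     }
+//   }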
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void highbd_compute_stats_win5_neon(
+    const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride,
+    int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) {
+  const int wiener_win = 5;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+  // We use an intermediate matrix that will be transposed to get M.
+  int64_t M_trn[25];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int16x8_t vavg = vdupq_n_s16(avg);
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 =
+            vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg);
+        int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src[j] - avg;
+        int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg;
+        int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg;
+        int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg;
+        int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg;
+        int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg;
+
+        M_trn[row * wiener_win + 0] += d0 * s;
+        M_trn[row * wiener_win + 1] += d1 * s;
+        M_trn[row * wiener_win + 2] += d2 * s;
+        M_trn[row * wiener_win + 3] += d3 * s;
+        M_trn[row * wiener_win + 4] += d4 * s;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+        dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+        dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+        dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+        dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[5];
+          dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+          dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+          dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+          dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+          dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+      j += 8;
+    }
+
+    if (j < width) {
+      // Process remaining columns using a mask to discard excess elements.
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column.
+          int16x8_t dgd1[5];
+          dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg);
+          dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg);
+          dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg);
+          dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg);
+          dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+    }
+    dgd += dgd_stride;
+    src += src_stride;
+  } while (--height != 0);
+
+  // Transpose M_trn.
+  transpose_M_win5(M, M_trn, 5);
+
+  // Copy upper triangle of H in the lower one.
+  copy_upper_triangle(H, wiener_win2);
+
+  // Scaling the results.
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12) {
+    bit_depth_divider = 16;
+  } else if (bit_depth == AOM_BITS_10) {
+    bit_depth_divider = 4;
+  }
+
+  for (int i = 0; i < wiener_win2; ++i) {
+    M[i] /= bit_depth_divider;
+    for (int j = 0; j < wiener_win2; ++j) {
+      H[i * wiener_win2 + j] /= bit_depth_divider;
+    }
+  }
+}
+
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+                                   const uint8_t *src8, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+  const int wiener_halfwin = wiener_win >> 1;
+  const int wiener_win2 = wiener_win * wiener_win;
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  const int16_t *src = (const int16_t *)CONVERT_TO_SHORTPTR(src8);
+  const int16_t *dgd = (const int16_t *)CONVERT_TO_SHORTPTR(dgd8);
+  const int height = v_end - v_start;
+  const int width = h_end - h_start;
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+
+  int16_t avg = highbd_find_average_neon(dgd + v_start * dgd_stride + h_start,
+                                         dgd_stride, width, height);
+
+  src += v_start * src_stride + h_start;
+  dgd += vert_offset * dgd_stride + horiz_offset;
+
+  if (wiener_win == WIENER_WIN) {
+    highbd_compute_stats_win7_neon(dgd, dgd_stride, src, src_stride, width,
+                                   height, M, H, avg, bit_depth);
+  } else {
+    highbd_compute_stats_win5_neon(dgd, dgd_stride, src, src_stride, width,
+                                   height, M, H, avg, bit_depth);
+  }
+}
diff --git a/av1/encoder/arm/neon/highbd_rdopt_neon.c b/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 0000000..4bf7ae6
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int32x4_t c = vld1q_s32(coeff);
+    const int32x4_t d = vld1q_s32(dqcoeff);
+
+    const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+    err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+    err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+    ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+    ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+    coeff += 4;
+    dqcoeff += 4;
+    block_size -= 4;
+  } while (block_size != 0);
+
+  *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift;
+  return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift;
+}
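+
+// Scalar equivalent of the computation above (illustrative sketch only; it
+// mirrors what the Neon loop accumulates before the final rounded shifts):
+//
+//   int64_t err = 0, sqcoeff = 0;
+//   for (intptr_t i = 0; i < block_size; i++) {
+//     const int64_t diff = coeff[i] - dqcoeff[i];
+//     err += diff * diff;
+//     sqcoeff += (int64_t)coeff[i] * coeff[i];
+//   }
+//   *ssz = (sqcoeff + rounding) >> shift;
+//   return (err + rounding) >> shift;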
diff --git a/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 0000000..88e176f
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_squared_error(
+    const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+    const uint32_t stride2, const uint32_t block_width,
+    const uint32_t block_height, uint32_t *frame_sse,
+    const unsigned int dst_stride) {
+  uint32_t *dst = frame_sse;
+
+  uint32_t i = 0;
+  do {
+    uint32_t j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j);
+      uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j);
+
+      uint16x8_t abs_diff = vabdq_u16(s, r);
+      uint32x4_t sse_lo =
+          vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff));
+      uint32x4_t sse_hi =
+          vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff));
+
+      vst1q_u32(dst + j, sse_lo);
+      vst1q_u32(dst + j + 4, sse_hi);
+
+      j += 8;
+    } while (j < block_width);
+
+    dst += dst_stride;
+    i++;
+  } while (i < block_height);
+}
+
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+                                          const uint32x4_t mask_single) {
+  uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single);
+  return horizontal_add_u32x4(vsums);
+}
+
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+                                            const uint32x4_t mask1,
+                                            const uint32x4_t mask2) {
+  uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[1][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[2][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[3][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[4][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[0][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[1][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[2][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[3][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[4][1], mask2);
+  return vsums;
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame, const unsigned int stride,
+    const uint32_t block_width, const uint32_t block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+    const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+    const double decay_factor, const double inv_factor,
+    const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+    int bd) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_neon[BH][BW] = { 0 };
+  const int half_window = TF_WINDOW_LENGTH >> 1;
+
+  uint32x4_t vsrc[5][2] = { 0 };
+  const uint32x4_t k0000 = vdupq_n_u32(0);
+  const uint32x4_t k1111 = vdupq_n_u32(1);
+  const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };
+  const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };
+  const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };
+  const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };
+  const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+  const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+  const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+  const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+
+  uint32x4_t vmask1[4], vmask2[4];
+  vmask1[0] = k1111;
+  vmask2[0] = vextq_u32(k1111, k0000, 3);
+  vmask1[1] = vextq_u32(k0000, k1111, 3);
+  vmask2[1] = vextq_u32(k1111, k0000, 2);
+  vmask1[2] = vextq_u32(k0000, k1111, 2);
+  vmask2[2] = vextq_u32(k1111, k0000, 1);
+  vmask1[3] = vextq_u32(k0000, k1111, 1);
+  vmask2[3] = k1111;
+
+  uint32_t row = 0;
+  do {
+    uint32_t col = 0;
+    const uint32_t *src = frame_sse + row * frame_sse_stride;
+    if (row == 0) {
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First 2 rows of the 5x5 matrix are padded from the 1st.
+      vsrc[0][0] = vsrc[2][0];
+      vsrc[1][0] = vsrc[2][0];
+    } else if (row == 1) {
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First row of the 5x5 matrix is padded from the 1st.
+      vsrc[0][0] = vsrc[1][0];
+    } else if (row == block_height - 2) {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+      // Last row of the 5x5 matrix is padded from the one before.
+      vsrc[4][0] = vsrc[3][0];
+    } else if (row == block_height - 1) {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+
+      // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+      vsrc[3][0] = vsrc[2][0];
+      vsrc[4][0] = vsrc[2][0];
+    } else {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+    }
+
+    acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+    acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+
+    col += 4;
+    src += 4;
+    // Traverse 4 columns at a time
+    do {
+      if (row == 0) {
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+        vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First 2 rows of the 5x5 matrix are padded from the 1st.
+        vsrc[0][1] = vsrc[2][1];
+        vsrc[1][1] = vsrc[2][1];
+      } else if (row == 1) {
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+        vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First row of the 5x5 matrix is padded from the 1st.
+        vsrc[0][1] = vsrc[1][1];
+      } else if (row == block_height - 2) {
+        vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+        // Last row of the 5x5 matrix is padded from the one before.
+        vsrc[4][1] = vsrc[3][1];
+      } else if (row == block_height - 1) {
+        vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+
+        // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+        vsrc[3][1] = vsrc[2][1];
+        vsrc[4][1] = vsrc[2][1];
+      } else {
+        vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+        vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+      }
+
+      uint32x4_t sums[4];
+      sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]);
+      sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]);
+      sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]);
+      sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]);
+      vst1q_u32(&acc_5x5_neon[row][col - half_window],
+                horizontal_add_4d_u32x4(sums));
+
+      vsrc[0][0] = vsrc[0][1];
+      vsrc[1][0] = vsrc[1][1];
+      vsrc[2][0] = vsrc[2][1];
+      vsrc[3][0] = vsrc[3][1];
+      vsrc[4][0] = vsrc[4][1];
+
+      src += 4;
+      col += 4;
+    } while (col <= block_width - 4);
+
+    acc_5x5_neon[row][col - half_window] =
+        sum_kernel5x5_mask_single(vsrc, k2111);
+    acc_5x5_neon[row][col - half_window + 1] =
+        sum_kernel5x5_mask_single(vsrc, k3110);
+
+    row++;
+  } while (row < block_height);
+
+  // Perform filtering.
+  if (tf_wgt_calc_lvl == 0) {
+    for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      for (unsigned int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame[i * stride + j];
+        // Scale down the difference for high bit depth input.
+        const uint32_t diff_sse =
+            (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+        // Compute filter weight.
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+        accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+      }
+    }
+  } else {
+    for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      for (unsigned int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame[i * stride + j];
+        // Scale down the difference for high bit depth input.
+        const uint32_t diff_sse =
+            (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+        // Compute filter weight.
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const float fweight =
+            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+        const int weight = iroundpf(fweight);
+        accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+      }
+    }
+  }
+}
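+
+// The accumulation into acc_5x5_neon above is a 5x5 box sum of frame_sse
+// around each pixel, with out-of-block positions clamped to the block edge.
+// In scalar form, for a given pixel (i, j) (illustrative sketch only):
+//
+//   uint32_t sum = 0;
+//   for (int wi = -2; wi <= 2; wi++) {
+//     for (int wj = -2; wj <= 2; wj++) {
+//       const int y = CLIP(i + wi, 0, (int)block_height - 1);
+//       const int x = CLIP(j + wj, 0, (int)block_width - 1);
+//       sum += frame_sse[y * frame_sse_stride + x];
+//     }
+//   }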
+
+void av1_highbd_apply_temporal_filter_neon(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+  assert(is_high_bitdepth);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  // Frame information.
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[BW * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride =
+        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const uint32_t frame_sse_stride = plane_w;
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = frame_sse_stride
+                             << ss_x_shift;  // Width of Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+            }
+          }
+        }
+      }
+    }
+    get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+                      plane_h, frame_sse, frame_sse_stride);
+
+    highbd_apply_temporal_filter(
+        pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+        accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
+        luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+
+    plane_offset += plane_h * plane_w;
+  }
+}
+
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+                                                        int height, int width,
+                                                        int stride,
+                                                        int bitdepth,
+                                                        int edge_thresh) {
+  uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+  uint64x2_t acc = vdupq_n_u64(0);
+  // Count is in theory positive as it counts the number of times we're under
+  // the threshold, but it will be counted negatively in order to make best use
+  // of the vclt instruction, which sets every bit of a lane to 1 when the
+  // condition is true.
+  int32x4_t count = vdupq_n_s32(0);
+  int final_count = 0;
+  uint64_t final_acc = 0;
+  const uint16_t *src_start = src + stride + 1;
+  int h = 1;
+
+  do {
+    int w = 1;
+    const uint16_t *src_ptr = src_start;
+
+    while (w <= (width - 1) - 8) {
+      uint16x8_t mat[3][3];
+      mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+      mat[0][1] = vld1q_u16(src_ptr - stride);
+      mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+      mat[1][0] = vld1q_u16(src_ptr - 1);
+      mat[1][1] = vld1q_u16(src_ptr);
+      mat[1][2] = vld1q_u16(src_ptr + 1);
+      mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+      mat[2][1] = vld1q_u16(src_ptr + stride);
+      mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+      uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+      gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+      gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+      uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+      gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+      gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+      ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+      uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+      uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+      uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+      uint16x8_t adj = vaddq_u16(adj0, adj1);
+      adj = vaddq_u16(adj, adj);
+
+      uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+      uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+      uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+      v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+      uint32x4_t v_u32 = vpaddlq_u16(v);
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+      w += 8;
+      src_ptr += 8;
+    }
+
+    if (w <= (width - 1) - 4) {
+      uint16x4_t mat[3][3];
+      mat[0][0] = vld1_u16(src_ptr - stride - 1);
+      mat[0][1] = vld1_u16(src_ptr - stride);
+      mat[0][2] = vld1_u16(src_ptr - stride + 1);
+      mat[1][0] = vld1_u16(src_ptr - 1);
+      mat[1][1] = vld1_u16(src_ptr);
+      mat[1][2] = vld1_u16(src_ptr + 1);
+      mat[2][0] = vld1_u16(src_ptr + stride - 1);
+      mat[2][1] = vld1_u16(src_ptr + stride);
+      mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+      uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+      gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+      gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+      uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+      gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+      gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+      uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+      ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+      uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+      uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+      uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+      uint16x4_t adj = vadd_u16(adj0, adj1);
+      adj = vadd_u16(adj, adj);
+
+      uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+      uint16x4_t diag = vadd_u16(diag0, diag1);
+
+      uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+      v = vand_u16(v, thresh_u16);
+      uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+      w += 4;
+      src_ptr += 4;
+    }
+
+    while (w < width - 1) {
+      int mat[3][3];
+      mat[0][0] = *(src_ptr - stride - 1);
+      mat[0][1] = *(src_ptr - stride);
+      mat[0][2] = *(src_ptr - stride + 1);
+      mat[1][0] = *(src_ptr - 1);
+      mat[1][1] = *(src_ptr);
+      mat[1][2] = *(src_ptr + 1);
+      mat[2][0] = *(src_ptr + stride - 1);
+      mat[2][1] = *(src_ptr + stride);
+      mat[2][2] = *(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+      // Accumulate Laplacian.
+      const int is_under = ga < edge_thresh;
+      const int v = 4 * mat[1][1] -
+                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+      final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+      final_count += is_under;
+
+      src_ptr++;
+      w++;
+    }
+    src_start += stride;
+  } while (++h < height - 1);
+
+  // We counted negatively, so subtract to get the final value.
+  final_count -= horizontal_add_s32x4(count);
+  final_acc += horizontal_add_u64x2(acc);
+  return (final_count < 16)
+             ? -1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
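+
+// A small worked example of the negative counting used above: if three of the
+// eight lanes of ga are under the threshold, vcltq_u16 produces 0xFFFF in
+// those lanes. Reinterpreted as signed 16-bit values they are -1, so the
+// pairwise accumulation adds -3 to count, and the final
+// final_count -= horizontal_add_s32x4(count) turns that back into +3.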
diff --git a/av1/encoder/arm/neon/pickrst_neon.c b/av1/encoder/arm/neon/pickrst_neon.c
new file mode 100644
index 0000000..6227028
--- /dev/null
+++ b/av1/encoder/arm/neon/pickrst_neon.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const int32x4_t zero = vdupq_n_s32(0);
+  uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const int16x8_t flt0_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
+                         vqmovn_s32(vld1q_s32(&flt0[j + 4])));
+        const int16x8_t flt1_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
+                         vqmovn_s32(vld1q_s32(&flt1[j + 4])));
+        const int16x8_t u0 =
+            vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
+        const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
+        const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
+        const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
+        const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
+
+        int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
+        v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
+        int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
+        v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
+                                       vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const uint16x8_t d0s0 = vsubl_u8(d0, s0);
+        const uint16x8x2_t d0w =
+            vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
+
+        const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
+        const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
+
+        int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
+        v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
+                         xq_active * (1 << SGRPROJ_RST_BITS));
+        int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
+        v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
+                         xq_active * (1 << SGRPROJ_RST_BITS));
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 =
+            vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq_active * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+  } else {
+    uint32x4_t err0 = vreinterpretq_u32_s32(zero);
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j <= width - 16; j += 16) {
+        const uint8x16_t d = vld1q_u8(&dat[j]);
+        const uint8x16_t s = vld1q_u8(&src[j]);
+        const uint8x16_t diff = vabdq_u8(d, s);
+        const uint8x8_t diff0 = vget_low_u8(diff);
+        const uint8x8_t diff1 = vget_high_u8(diff);
+        err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
+        err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t e = dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+    sum64 = vpaddlq_u32(err0);
+  }
+#if AOM_ARCH_AARCH64
+  err += vaddvq_u64(sum64);
+#else
+  err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
+#endif  // AOM_ARCH_AARCH64
+  return err;
+}
+
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+                                        int width, int height) {
+  uint64_t sum = 0;
+
+  if (width >= 16) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit value. Since each
+    // 16-bit vector has 8 elements, we can process up to int(257*8/width)
+    // rows before we need to widen to 32-bit vector elements.
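+    // For example, 257 * 255 = 65535 fits exactly in a uint16_t; with 8 lanes
+    // per vector that is 257 * 8 samples, so for width == 64 we get
+    // h_overflow = 257 * 8 / 64 = 32 rows per 16-bit accumulation batch.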
+    int h_overflow = 257 * 8 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x4_t avg_u32 = vdupq_n_u32(0);
+    do {
+      uint16x8_t avg_u16 = vdupq_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        do {
+          uint8x16_t s = vld1q_u8(src_ptr);
+          avg_u16 = vpadalq_u8(avg_u16, s);
+          j -= 16;
+          src_ptr += 16;
+        } while (j >= 16);
+        if (j >= 8) {
+          uint8x8_t s = vld1_u8(src_ptr);
+          avg_u16 = vaddw_u8(avg_u16, s);
+          j -= 8;
+          src_ptr += 8;
+        }
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+                     (width * height));
+  }
+  if (width >= 8) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit value. Since each
+    // 16-bit vector has 4 elements, we can process up to int(257*4/width)
+    // rows before we need to widen to 32-bit vector elements.
+    int h_overflow = 257 * 4 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x2_t avg_u32 = vdup_n_u32(0);
+    do {
+      uint16x4_t avg_u16 = vdup_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        uint8x8_t s = vld1_u8(src_ptr);
+        avg_u16 = vpadal_u8(avg_u16, s);
+        j -= 8;
+        src_ptr += 8;
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+                     (width * height));
+  }
+  int i = height;
+  do {
+    int j = 0;
+    do {
+      sum += src[j];
+    } while (++j < width);
+    src += src_stride;
+  } while (--i != 0);
+  return (uint8_t)(sum / (width * height));
+}
+
+static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
+                                   int16_t *buf_avg, int buf_avg_stride,
+                                   int width, int height,
+                                   int downsample_factor) {
+  uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+  if (width > 8) {
+    int i = 0;
+    do {
+      int j = width;
+      const uint8_t *buf_ptr = buf;
+      int16_t *buf_avg_ptr = buf_avg;
+      do {
+        uint8x8_t d = vld1_u8(buf_ptr);
+        vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
+
+        j -= 8;
+        buf_ptr += 8;
+        buf_avg_ptr += 8;
+      } while (j >= 8);
+      while (j > 0) {
+        *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg;
+        buf_avg_ptr++;
+        j--;
+      }
+      buf += buf_stride;
+      buf_avg += buf_avg_stride;
+      i += downsample_factor;
+    } while (i < height);
+  } else {
+    // For width <= 8, don't use Neon.
+    for (int i = 0; i < height; i = i + downsample_factor) {
+      for (int j = 0; j < width; j++) {
+        buf_avg[j] = (int16_t)buf[j] - (int16_t)avg;
+      }
+      buf += buf_stride;
+      buf_avg += buf_avg_stride;
+    }
+  }
+}
+
+static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
+                                     const int wiener_win,
+                                     const int wiener_win2, int32x4_t df_s32) {
+  for (int row0 = 0; row0 < wiener_win; row0++) {
+    for (int row1 = row0; row1 < wiener_win; row1++) {
+      int auto_cov_idx =
+          (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+      int32x4_t auto_cov =
+          vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+      auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+                           vget_high_s16(dgd[row1]));
+      auto_cov = vshlq_s32(auto_cov, df_s32);
+
+      H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+    }
+  }
+}
+
+static INLINE void compute_H_one_col_last_row(int16x8_t *dgd, int col,
+                                              int64_t *H, const int wiener_win,
+                                              const int wiener_win2,
+                                              int last_row_df) {
+  for (int row0 = 0; row0 < wiener_win; row0++) {
+    for (int row1 = row0; row1 < wiener_win; row1++) {
+      int auto_cov_idx =
+          (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+      int32x4_t auto_cov =
+          vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1]));
+      auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]),
+                           vget_high_s16(dgd[row1]));
+      auto_cov = vmulq_n_s32(auto_cov, last_row_df);
+
+      H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+    }
+  }
+}
+
+// When we load 8 values of int16_t type and need less than 8 values for
+// processing, the below mask is used to make the extra values zero.
+const int16_t av1_neon_mask_16bit[16] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0,
+};
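+
+// For example, when width % 8 == 3 the load
+// vld1q_s16(&av1_neon_mask_16bit[8] - 3) starts at index 5 and yields
+// { -1, -1, -1, 0, 0, 0, 0, 0 }, so a vandq_s16 with this mask keeps the
+// first 3 lanes and zeroes the rest.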
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win7_neon(int16_t *dgd_avg, int dgd_avg_stride,
+                                           int16_t *src_avg, int src_avg_stride,
+                                           int width, int v_start, int v_end,
+                                           int64_t *M, int64_t *H,
+                                           int downsample_factor,
+                                           int last_row_downsample_factor) {
+  const int wiener_win = 7;
+  const int wiener_win2 = wiener_win * wiener_win;
+  // The downsample factor can be either 1 or 4, so instead of multiplying the
+  // values by 1 or 4 we can left shift by 0 or 2 respectively, which is
+  // faster. (This does not apply to the last row, where the values may be
+  // scaled by 1, 2 or 3, so the multiplication is kept there.)
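+  // For example, downsample_factor == 4 gives downsample_shift == 2, so the
+  // vshlq_s16/vshlq_s32 calls below multiply the values by 4.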
+  const int downsample_shift = downsample_factor >> 1;
+  const int16x8_t df_s16 = vdupq_n_s16(downsample_shift);
+  const int32x4_t df_s32 = vdupq_n_s32(downsample_shift);
+  const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+  // We use an intermediate matrix that will be transposed to get M.
+  int64_t M_trn[49];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int h = v_start;
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+        // Load src and scale based on downsampling factor.
+        int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src_avg[j] * downsample_factor;
+        int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+        int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+        int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+        int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+        int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+        int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j];
+        int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j];
+
+        M_trn[row * wiener_win + 0] += d0 * s;
+        M_trn[row * wiener_win + 1] += d1 * s;
+        M_trn[row * wiener_win + 2] += d2 * s;
+        M_trn[row * wiener_win + 3] += d3 * s;
+        M_trn[row * wiener_win + 4] += d4 * s;
+        M_trn[row * wiener_win + 5] += d5 * s;
+        M_trn[row * wiener_win + 6] += d6 * s;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        int16x8_t dgd0[7];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+        dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[7];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+          dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+          dgd1[5] = vshlq_s16(dgd1[5], df_s16);
+          dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+          dgd1[6] = vshlq_s16(dgd1[6], df_s16);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+      j += 8;
+    }
+
+    if (j < width) {
+      // Process remaining columns using a mask to discard excess elements.
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[7];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+        dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+        dgd0[5] = vandq_s16(dgd0[5], mask);
+        dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+        dgd0[6] = vandq_s16(dgd0[6], mask);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[7];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+          dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+          dgd1[5] = vshlq_s16(dgd1[5], df_s16);
+          dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+          dgd1[6] = vshlq_s16(dgd1[6], df_s16);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+    }
+    dgd_avg += downsample_factor * dgd_avg_stride;
+    src_avg += src_avg_stride;
+    h += downsample_factor;
+  } while (h <= v_end - downsample_factor);
+
+  if (h < v_end) {
+    // The last row is scaled by a different downsample factor, so process
+    // separately.
+
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+        // Load src vector and scale based on downsampling factor.
+        int16x8_t s =
+            vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src_avg[j];
+        int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+        int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+        int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+        int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+        int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+        int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j];
+        int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j];
+
+        M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 5] += d5 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 6] += d6 * s * last_row_downsample_factor;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      int col0 = 0;
+      do {
+        // Load first column.
+        int16x8_t dgd0[7];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+        dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+                                   last_row_downsample_factor);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[7];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+          dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+          dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor);
+          dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+          dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      } while (++col0 < wiener_win);
+      j += 8;
+    }
+
+    // Process remaining columns using a mask to discard excess elements.
+    if (j < width) {
+      int col0 = 0;
+      do {
+        // Load first column.
+        int16x8_t dgd0[7];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+        dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0);
+        dgd0[5] = vandq_s16(dgd0[5], mask);
+        dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0);
+        dgd0[6] = vandq_s16(dgd0[6], mask);
+
+        // Perform computation of the first column with itself (28 elements).
+        // For the first column this will fill the upper triangle of the 7x7
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 7x7 matrices around H's
+        // diagonal.
+        compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+                                   last_row_downsample_factor);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[7];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+          dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1);
+          dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor);
+          dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1);
+          dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor);
+
+          // Compute all elements from the combination of both columns (49
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      } while (++col0 < wiener_win);
+    }
+  }
+
+  // Transpose M_trn.
+  transpose_M_win7(M, M_trn, 7);
+
+  // Copy the upper triangle of H into the lower one.
+  copy_upper_triangle(H, wiener_win2);
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win5_neon(int16_t *dgd_avg, int dgd_avg_stride,
+                                           int16_t *src_avg, int src_avg_stride,
+                                           int width, int v_start, int v_end,
+                                           int64_t *M, int64_t *H,
+                                           int downsample_factor,
+                                           int last_row_downsample_factor) {
+  const int wiener_win = 5;
+  const int wiener_win2 = wiener_win * wiener_win;
+  // The downsample factor can be either 1 or 4, so instead of multiplying the
+  // values by 1 or 4 we can left shift by 0 or 2 respectively, which is
+  // faster. (This does not apply to the last row, where the values may be
+  // scaled by 1, 2 or 3, so the multiplication is kept there.)
+  const int downsample_shift = downsample_factor >> 1;
+  const int16x8_t df_s16 = vdupq_n_s16(downsample_shift);
+  const int32x4_t df_s32 = vdupq_n_s32(downsample_shift);
+  const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8));
+
+  // We use an intermediate matrix that will be transposed to get M.
+  int64_t M_trn[25];
+  memset(M_trn, 0, sizeof(M_trn));
+
+  int h = v_start;
+  do {
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+        // Load src vector and scale based on downsampling factor.
+        int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src_avg[j];
+        int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+        int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+        int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+        int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+        int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+
+        M_trn[row * wiener_win + 0] += d0 * s * downsample_factor;
+        M_trn[row * wiener_win + 1] += d1 * s * downsample_factor;
+        M_trn[row * wiener_win + 2] += d2 * s * downsample_factor;
+        M_trn[row * wiener_win + 3] += d3 * s * downsample_factor;
+        M_trn[row * wiener_win + 4] += d4 * s * downsample_factor;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[5];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a mask to discard excess elements.
+    if (j < width) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[5];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vshlq_s16(dgd1[0], df_s16);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vshlq_s16(dgd1[1], df_s16);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vshlq_s16(dgd1[2], df_s16);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vshlq_s16(dgd1[3], df_s16);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vshlq_s16(dgd1[4], df_s16);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+    }
+    dgd_avg += downsample_factor * dgd_avg_stride;
+    src_avg += src_avg_stride;
+    h += downsample_factor;
+  } while (h <= v_end - downsample_factor);
+
+  if (h < v_end) {
+    // The last row is scaled by a different downsample factor, so process
+    // separately.
+
+    // Cross-correlation (M).
+    for (int row = 0; row < wiener_win; row++) {
+      int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride);
+      int j = 0;
+      while (j <= width - 8) {
+        int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8);
+        // Load src vector and scale based on downsampling factor.
+        int16x8_t s =
+            vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor);
+
+        // Compute all the elements of one row of M.
+        compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row);
+
+        dgd0 = dgd1;
+        j += 8;
+      }
+
+      // Process remaining elements without Neon.
+      while (j < width) {
+        int16_t s = src_avg[j];
+        int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j];
+        int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j];
+        int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j];
+        int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j];
+        int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j];
+
+        M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor;
+        M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor;
+
+        j++;
+      }
+    }
+
+    // Auto-covariance (H).
+    int j = 0;
+    while (j <= width - 8) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+                                   last_row_downsample_factor);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[5];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+      j += 8;
+    }
+
+    // Process remaining columns using a mask to discard excess elements.
+    if (j < width) {
+      for (int col0 = 0; col0 < wiener_win; col0++) {
+        // Load first column.
+        int16x8_t dgd0[5];
+        dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0);
+        dgd0[0] = vandq_s16(dgd0[0], mask);
+        dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0);
+        dgd0[1] = vandq_s16(dgd0[1], mask);
+        dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0);
+        dgd0[2] = vandq_s16(dgd0[2], mask);
+        dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0);
+        dgd0[3] = vandq_s16(dgd0[3], mask);
+        dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0);
+        dgd0[4] = vandq_s16(dgd0[4], mask);
+
+        // Perform computation of the first column with itself (15 elements).
+        // For the first column this will fill the upper triangle of the 5x5
+        // matrix at the top left of the H matrix. For the next columns this
+        // will fill the upper triangle of the other 5x5 matrices around H's
+        // diagonal.
+        compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2,
+                                   last_row_downsample_factor);
+
+        // All computation next to the matrix diagonal has already been done.
+        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+          // Load second column and scale based on downsampling factor.
+          int16x8_t dgd1[5];
+          dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1);
+          dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor);
+          dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1);
+          dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor);
+          dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1);
+          dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor);
+          dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1);
+          dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor);
+          dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1);
+          dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor);
+
+          // Compute all elements from the combination of both columns (25
+          // elements).
+          compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win,
+                             wiener_win2);
+        }
+      }
+    }
+  }
+
+  // Transpose M_trn.
+  transpose_M_win5(M, M_trn, 5);
+
+  // Copy the upper triangle of H into the lower one.
+  copy_upper_triangle(H, wiener_win2);
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+                            const uint8_t *src, int16_t *dgd_avg,
+                            int16_t *src_avg, int h_start, int h_end,
+                            int v_start, int v_end, int dgd_stride,
+                            int src_stride, int64_t *M, int64_t *H,
+                            int use_downsampled_wiener_stats) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = wiener_win >> 1;
+  const int32_t width = h_end - h_start;
+  const int32_t height = v_end - v_start;
+  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
+  assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+  int dgd_avg_stride = width + 2 * wiener_halfwin;
+  int src_avg_stride = width;
+
+  // Compute (dgd - avg) and store it in dgd_avg.
+  // The wiener window will slide along the dgd frame, centered on each pixel.
+  // For the top left pixel and all the pixels on the side of the frame this
+  // means half of the window will be outside of the frame. As such the actual
+  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
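+  // For example, with wiener_win == 7 (wiener_halfwin == 3) the subtracted
+  // region is (width + 6) x (height + 6).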
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
+
+  // Compute (src - avg), downsample if necessary and store it in src_avg.
+  const uint8_t *src_start = src + h_start + v_start * src_stride;
+  compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
+                  src_avg_stride, width, height, downsample_factor);
+
+  // Since the height is not necessarily a multiple of the downsample factor,
+  // the last line of src will be scaled according to how many rows remain.
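+  // For example, height == 10 with a downsample factor of 4 leaves two source
+  // rows after the main loop, so the last processed row is scaled by
+  // 10 % 4 == 2.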
+  int last_row_downsample_factor =
+      use_downsampled_wiener_stats ? height % downsample_factor : 1;
+
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_win7_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+                            width, v_start, v_end, M, H, downsample_factor,
+                            last_row_downsample_factor);
+  } else {
+    compute_stats_win5_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+                            width, v_start, v_end, M, H, downsample_factor,
+                            last_row_downsample_factor);
+  }
+}
+
+static INLINE void calc_proj_params_r0_r1_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+
+  int64x2_t h00_lo = vdupq_n_s64(0);
+  int64x2_t h00_hi = vdupq_n_s64(0);
+  int64x2_t h11_lo = vdupq_n_s64(0);
+  int64x2_t h11_hi = vdupq_n_s64(0);
+  int64x2_t h01_lo = vdupq_n_s64(0);
+  int64x2_t h01_hi = vdupq_n_s64(0);
+  int64x2_t c0_lo = vdupq_n_s64(0);
+  int64x2_t c0_hi = vdupq_n_s64(0);
+  int64x2_t c1_lo = vdupq_n_s64(0);
+  int64x2_t c1_hi = vdupq_n_s64(0);
+
+  do {
+    const uint8_t *src_ptr = src8;
+    const uint8_t *dat_ptr = dat8;
+    int32_t *flt0_ptr = flt0;
+    int32_t *flt1_ptr = flt1;
+    int w = width;
+
+    do {
+      uint8x8_t s = vld1_u8(src_ptr);
+      uint8x8_t d = vld1_u8(dat_ptr);
+      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+      f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+      f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+      f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+      f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+      h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+      h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+      h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+      h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt0_ptr += 8;
+      flt1_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src8 += src_stride;
+    dat8 += dat_stride;
+    flt0 += flt0_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+  H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  H[1][0] = H[0][1];
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width,
+                                            int height, int src_stride,
+                                            const uint8_t *dat8, int dat_stride,
+                                            int32_t *flt0, int flt0_stride,
+                                            int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+
+  int64x2_t h00_lo = vdupq_n_s64(0);
+  int64x2_t h00_hi = vdupq_n_s64(0);
+  int64x2_t c0_lo = vdupq_n_s64(0);
+  int64x2_t c0_hi = vdupq_n_s64(0);
+
+  do {
+    const uint8_t *src_ptr = src8;
+    const uint8_t *dat_ptr = dat8;
+    int32_t *flt0_ptr = flt0;
+    int w = width;
+
+    do {
+      uint8x8_t s = vld1_u8(src_ptr);
+      uint8x8_t d = vld1_u8(dat_ptr);
+      int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+      int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+      f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+      f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+
+      h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+      h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+      h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+      h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+      c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+      c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+      c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+      c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt0_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src8 += src_stride;
+    dat8 += dat_stride;
+    flt0 += flt0_stride;
+  } while (--height != 0);
+
+  H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+  C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width,
+                                            int height, int src_stride,
+                                            const uint8_t *dat8, int dat_stride,
+                                            int32_t *flt1, int flt1_stride,
+                                            int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int size = width * height;
+
+  int64x2_t h11_lo = vdupq_n_s64(0);
+  int64x2_t h11_hi = vdupq_n_s64(0);
+  int64x2_t c1_lo = vdupq_n_s64(0);
+  int64x2_t c1_hi = vdupq_n_s64(0);
+
+  do {
+    const uint8_t *src_ptr = src8;
+    const uint8_t *dat_ptr = dat8;
+    int32_t *flt1_ptr = flt1;
+    int w = width;
+
+    do {
+      uint8x8_t s = vld1_u8(src_ptr);
+      uint8x8_t d = vld1_u8(dat_ptr);
+      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+      f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+      f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt1_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src8 += src_stride;
+    dat8 += dat_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// This function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+//    of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+//    non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+//    non-zero and need to be computed.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
diff --git a/av1/encoder/arm/neon/pickrst_neon.h b/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 0000000..d9a9ad4
--- /dev/null
+++ b/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// When we load 8 values of int16_t type and need less than 8 values for
+// processing, the below mask is used to make the extra values zero.
+extern const int16_t av1_neon_mask_16bit[16];
+
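+// The compute_stats functions only fill the upper triangle of H; mirror it
+// into the lower triangle, two rows at a time, using 2x2 64-bit transposes.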
+static INLINE void copy_upper_triangle(int64_t *H, const int wiener_win2) {
+  for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
+    // Transpose the first 2x2 square. It needs a special case as the element
+    // of the bottom left is on the diagonal.
+    int64x2_t row0 = vld1q_s64(H + i * wiener_win2 + i + 1);
+    int64x2_t row1 = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
+
+    int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
+
+    vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
+    vst1q_s64(H + (i + 2) * wiener_win2 + i, tr_row);
+
+    // Transpose and store all the remaining 2x2 squares of the line.
+    for (int j = i + 3; j < wiener_win2; j = j + 2) {
+      row0 = vld1q_s64(H + i * wiener_win2 + j);
+      row1 = vld1q_s64(H + (i + 1) * wiener_win2 + j);
+
+      int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
+      int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
+
+      vst1q_s64(H + j * wiener_win2 + i, tr_row0);
+      vst1q_s64(H + (j + 1) * wiener_win2 + i, tr_row1);
+    }
+  }
+}
+
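+// Transpose the 5x5 row-major matrix M_trn into M. Pairs of rows are
+// transposed with 2x2 64-bit transposes (aom_vtrn1q_s64/aom_vtrn2q_s64); the
+// odd final row is copied element by element.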
+static INLINE void transpose_M_win5(int64_t *M, int64_t *M_trn,
+                                    const int wiener_win) {
+  // 1st and 2nd rows.
+  int64x2_t row00 = vld1q_s64(M_trn);
+  int64x2_t row10 = vld1q_s64(M_trn + wiener_win);
+  vst1q_s64(M, aom_vtrn1q_s64(row00, row10));
+  vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10));
+
+  int64x2_t row02 = vld1q_s64(M_trn + 2);
+  int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2);
+  vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12));
+  vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12));
+
+  // Last column only needs trn2.
+  int64x2_t row03 = vld1q_s64(M_trn + 3);
+  int64x2_t row13 = vld1q_s64(M_trn + wiener_win + 3);
+  vst1q_s64(M + 4 * wiener_win, aom_vtrn2q_s64(row03, row13));
+
+  // 3rd and 4th rows.
+  int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win);
+  int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win);
+  vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30));
+  vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30));
+
+  int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2);
+  int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2);
+  vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32));
+  vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32));
+
+  // Last column only needs trn2.
+  int64x2_t row23 = vld1q_s64(M_trn + 2 * wiener_win + 3);
+  int64x2_t row33 = vld1q_s64(M_trn + 3 * wiener_win + 3);
+  vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn2q_s64(row23, row33));
+
+  // Last row.
+  int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win);
+  vst1_s64(M + 4, vget_low_s64(row40));
+  vst1_s64(M + 1 * wiener_win + 4, vget_high_s64(row40));
+
+  int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2);
+  vst1_s64(M + 2 * wiener_win + 4, vget_low_s64(row42));
+  vst1_s64(M + 3 * wiener_win + 4, vget_high_s64(row42));
+
+  // Element on the bottom right of M_trn is copied as is.
+  vst1_s64(M + 4 * wiener_win + 4, vld1_s64(M_trn + 4 * wiener_win + 4));
+}
+
+static INLINE void transpose_M_win7(int64_t *M, int64_t *M_trn,
+                                    const int wiener_win) {
+  // 1st and 2nd rows.
+  int64x2_t row00 = vld1q_s64(M_trn);
+  int64x2_t row10 = vld1q_s64(M_trn + wiener_win);
+  vst1q_s64(M, aom_vtrn1q_s64(row00, row10));
+  vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10));
+
+  int64x2_t row02 = vld1q_s64(M_trn + 2);
+  int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2);
+  vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12));
+  vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12));
+
+  int64x2_t row04 = vld1q_s64(M_trn + 4);
+  int64x2_t row14 = vld1q_s64(M_trn + wiener_win + 4);
+  vst1q_s64(M + 4 * wiener_win, aom_vtrn1q_s64(row04, row14));
+  vst1q_s64(M + 5 * wiener_win, aom_vtrn2q_s64(row04, row14));
+
+  // Last column only needs trn2.
+  int64x2_t row05 = vld1q_s64(M_trn + 5);
+  int64x2_t row15 = vld1q_s64(M_trn + wiener_win + 5);
+  vst1q_s64(M + 6 * wiener_win, aom_vtrn2q_s64(row05, row15));
+
+  // 3rd and 4th rows.
+  int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win);
+  int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win);
+  vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30));
+  vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30));
+
+  int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2);
+  int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2);
+  vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32));
+  vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32));
+
+  int64x2_t row24 = vld1q_s64(M_trn + 2 * wiener_win + 4);
+  int64x2_t row34 = vld1q_s64(M_trn + 3 * wiener_win + 4);
+  vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn1q_s64(row24, row34));
+  vst1q_s64(M + 5 * wiener_win + 2, aom_vtrn2q_s64(row24, row34));
+
+  // Last column only needs trn2.
+  int64x2_t row25 = vld1q_s64(M_trn + 2 * wiener_win + 5);
+  int64x2_t row35 = vld1q_s64(M_trn + 3 * wiener_win + 5);
+  vst1q_s64(M + 6 * wiener_win + 2, aom_vtrn2q_s64(row25, row35));
+
+  // 5th and 6th rows.
+  int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win);
+  int64x2_t row50 = vld1q_s64(M_trn + 5 * wiener_win);
+  vst1q_s64(M + 4, aom_vtrn1q_s64(row40, row50));
+  vst1q_s64(M + wiener_win + 4, aom_vtrn2q_s64(row40, row50));
+
+  int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2);
+  int64x2_t row52 = vld1q_s64(M_trn + 5 * wiener_win + 2);
+  vst1q_s64(M + 2 * wiener_win + 4, aom_vtrn1q_s64(row42, row52));
+  vst1q_s64(M + 3 * wiener_win + 4, aom_vtrn2q_s64(row42, row52));
+
+  int64x2_t row44 = vld1q_s64(M_trn + 4 * wiener_win + 4);
+  int64x2_t row54 = vld1q_s64(M_trn + 5 * wiener_win + 4);
+  vst1q_s64(M + 4 * wiener_win + 4, aom_vtrn1q_s64(row44, row54));
+  vst1q_s64(M + 5 * wiener_win + 4, aom_vtrn2q_s64(row44, row54));
+
+  // Last column only needs trn2.
+  int64x2_t row45 = vld1q_s64(M_trn + 4 * wiener_win + 5);
+  int64x2_t row55 = vld1q_s64(M_trn + 5 * wiener_win + 5);
+  vst1q_s64(M + 6 * wiener_win + 4, aom_vtrn2q_s64(row45, row55));
+
+  // Last row.
+  int64x2_t row60 = vld1q_s64(M_trn + 6 * wiener_win);
+  vst1_s64(M + 6, vget_low_s64(row60));
+  vst1_s64(M + 1 * wiener_win + 6, vget_high_s64(row60));
+
+  int64x2_t row62 = vld1q_s64(M_trn + 6 * wiener_win + 2);
+  vst1_s64(M + 2 * wiener_win + 6, vget_low_s64(row62));
+  vst1_s64(M + 3 * wiener_win + 6, vget_high_s64(row62));
+
+  int64x2_t row64 = vld1q_s64(M_trn + 6 * wiener_win + 4);
+  vst1_s64(M + 4 * wiener_win + 6, vget_low_s64(row64));
+  vst1_s64(M + 5 * wiener_win + 6, vget_high_s64(row64));
+
+  // Element on the bottom right of M_trn is copied as is.
+  vst1_s64(M + 6 * wiener_win + 6, vld1_s64(M_trn + 6 * wiener_win + 6));
+}
+
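+// Accumulate one row of M_trn: multiply the (already scaled) src vector with
+// the dgd vector at each of the wiener_win horizontal offsets, where offsets
+// 1 to wiener_win - 1 are built with vextq_s16 from two adjacent dgd loads.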
+static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd0,
+                                          int16x8_t dgd1, int64_t *M,
+                                          const int wiener_win, int row) {
+  int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0);
+  int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0));
+  m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0));
+
+  int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1);
+  int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01));
+  m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01));
+
+  m0 = horizontal_add_2d_s32(m0, m1);
+  m_01 = vpadalq_s32(m_01, m0);
+  vst1q_s64(M + row * wiener_win + 0, m_01);
+
+  int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2);
+  int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02));
+  m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02));
+
+  int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3);
+  int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03));
+  m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03));
+
+  m2 = horizontal_add_2d_s32(m2, m3);
+  m_23 = vpadalq_s32(m_23, m2);
+  vst1q_s64(M + row * wiener_win + 2, m_23);
+
+  int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4);
+  int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04));
+  m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04));
+  M[row * wiener_win + 4] += horizontal_long_add_s32x4(m4);
+}
+
+static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd0,
+                                          int16x8_t dgd1, int64_t *M,
+                                          const int wiener_win, int row) {
+  int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0);
+  int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2);
+  int64x2_t m_45 = vld1q_s64(M + row * wiener_win + 4);
+
+  int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0));
+  m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0));
+
+  int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1);
+  int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01));
+  m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01));
+
+  m0 = horizontal_add_2d_s32(m0, m1);
+  m_01 = vpadalq_s32(m_01, m0);
+  vst1q_s64(M + row * wiener_win + 0, m_01);
+
+  int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2);
+  int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02));
+  m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02));
+
+  int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3);
+  int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03));
+  m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03));
+
+  m2 = horizontal_add_2d_s32(m2, m3);
+  m_23 = vpadalq_s32(m_23, m2);
+  vst1q_s64(M + row * wiener_win + 2, m_23);
+
+  int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4);
+  int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04));
+  m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04));
+
+  int16x8_t dgd05 = vextq_s16(dgd0, dgd1, 5);
+  int32x4_t m5 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd05));
+  m5 = vmlal_s16(m5, vget_high_s16(src), vget_high_s16(dgd05));
+
+  m4 = horizontal_add_2d_s32(m4, m5);
+  m_45 = vpadalq_s32(m_45, m4);
+  vst1q_s64(M + row * wiener_win + 4, m_45);
+
+  int16x8_t dgd06 = vextq_s16(dgd0, dgd1, 6);
+  int32x4_t m6 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd06));
+  m6 = vmlal_s16(m6, vget_high_s16(src), vget_high_s16(dgd06));
+  M[row * wiener_win + 6] += horizontal_long_add_s32x4(m6);
+}
+
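+// Accumulate into H the products of every element of wiener window column
+// col0 with every element of column col1, i.e. one full
+// wiener_win x wiener_win block of H. For example, with wiener_win == 5 the
+// (row0, row1) product is added at
+// H[(col0 * 5 + row0) * 25 + col1 * 5 + row1].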
+static INLINE void compute_H_two_cols(int16x8_t *dgd0, int16x8_t *dgd1,
+                                      int col0, int col1, int64_t *H,
+                                      const int wiener_win,
+                                      const int wiener_win2) {
+  for (int row0 = 0; row0 < wiener_win; row0++) {
+    for (int row1 = 0; row1 < wiener_win; row1++) {
+      int auto_cov_idx =
+          (col0 * wiener_win + row0) * wiener_win2 + (col1 * wiener_win) + row1;
+
+      int32x4_t auto_cov =
+          vmull_s16(vget_low_s16(dgd0[row0]), vget_low_s16(dgd1[row1]));
+      auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd0[row0]),
+                           vget_high_s16(dgd1[row1]));
+
+      H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov);
+    }
+  }
+}
+
+#endif  // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/av1/encoder/arm/neon/picksrt_neon.c b/av1/encoder/arm/neon/picksrt_neon.c
deleted file mode 100644
index 1346d6b..0000000
--- a/av1/encoder/arm/neon/picksrt_neon.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2020, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <math.h>
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/restoration.h"
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-int64_t av1_lowbd_pixel_proj_error_neon(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
-  int i, j, k;
-  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
-  const int32x4_t zero = vdupq_n_s32(0);
-  uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-
-  int64_t err = 0;
-  if (params->r[0] > 0 && params->r[1] > 0) {
-    for (i = 0; i < height; ++i) {
-      int32x4_t err0 = zero;
-      for (j = 0; j <= width - 8; j += 8) {
-        const uint8x8_t d0 = vld1_u8(&dat[j]);
-        const uint8x8_t s0 = vld1_u8(&src[j]);
-        const int16x8_t flt0_16b =
-            vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
-                         vqmovn_s32(vld1q_s32(&flt0[j + 4])));
-        const int16x8_t flt1_16b =
-            vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
-                         vqmovn_s32(vld1q_s32(&flt1[j + 4])));
-        const int16x8_t u0 =
-            vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
-        const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
-        const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
-        const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
-        const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
-        const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
-        const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
-
-        int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
-        v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
-        int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
-        v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
-        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
-        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
-        const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
-                                       vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
-        const int16x4_t e0_lo = vget_low_s16(e0);
-        const int16x4_t e0_hi = vget_high_s16(e0);
-        err0 = vmlal_s16(err0, e0_lo, e0_lo);
-        err0 = vmlal_s16(err0, e0_hi, e0_hi);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
-        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-      flt1 += flt1_stride;
-      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
-    }
-
-  } else if (params->r[0] > 0 || params->r[1] > 0) {
-    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
-    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
-    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
-    for (i = 0; i < height; ++i) {
-      int32x4_t err0 = zero;
-      for (j = 0; j <= width - 8; j += 8) {
-        const uint8x8_t d0 = vld1_u8(&dat[j]);
-        const uint8x8_t s0 = vld1_u8(&src[j]);
-        const uint16x8_t d0s0 = vsubl_u8(d0, s0);
-        const uint16x8x2_t d0w =
-            vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
-
-        const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
-        const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
-
-        int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
-        v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
-                         xq_active * (1 << SGRPROJ_RST_BITS));
-        int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
-        v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
-                         xq_active * (1 << SGRPROJ_RST_BITS));
-        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
-        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
-        const int16x8_t e0 =
-            vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
-        const int16x4_t e0_lo = vget_low_s16(e0);
-        const int16x4_t e0_hi = vget_high_s16(e0);
-        err0 = vmlal_s16(err0, e0_lo, e0_lo);
-        err0 = vmlal_s16(err0, e0_hi, e0_hi);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
-        int32_t v = xq_active * (flt[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt += flt_stride;
-      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
-    }
-  } else {
-    uint32x4_t err0 = vreinterpretq_u32_s32(zero);
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j <= width - 16; j += 16) {
-        const uint8x16_t d = vld1q_u8(&dat[j]);
-        const uint8x16_t s = vld1q_u8(&src[j]);
-        const uint8x16_t diff = vabdq_u8(d, s);
-        const uint8x8_t diff0 = vget_low_u8(diff);
-        const uint8x8_t diff1 = vget_high_u8(diff);
-        err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
-        err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t e = dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-    }
-    sum64 = vpaddlq_u32(err0);
-  }
-#if AOM_ARCH_AARCH64
-  err += vaddvq_u64(sum64);
-#else
-  err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
-#endif  // AOM_ARCH_AARCH64
-  return err;
-}
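The deleted av1_lowbd_pixel_proj_error_neon (presumably consolidated with the other pickrst Neon kernels elsewhere in this change) vectorises the error term that its scalar remainder loops spell out. A minimal scalar sketch of the two-filter case, with the rounding shift of 11 (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) written out:

#include <stdint.h>

// One row of the two-filter (r[0] > 0 && r[1] > 0) error term: upscale the
// degraded sample, project it with both filters, round, and accumulate the
// squared difference against the source.
static int64_t pixel_proj_error_row_scalar(const uint8_t *src,
                                           const uint8_t *dat,
                                           const int32_t *flt0,
                                           const int32_t *flt1, int width,
                                           const int xq[2], int rst_bits) {
  int64_t err = 0;
  for (int k = 0; k < width; ++k) {
    const int32_t u = dat[k] << rst_bits;
    const int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
    const int32_t e = ((v + (1 << 10)) >> 11) + dat[k] - src[k];
    err += (int64_t)e * e;
  }
  return err;
}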
diff --git a/av1/encoder/arm/neon/reconinter_enc_neon.c b/av1/encoder/arm/neon/reconinter_enc_neon.c
index e5975b0..03afa30 100644
--- a/av1/encoder/arm/neon/reconinter_enc_neon.c
+++ b/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -94,13 +94,13 @@
   } else if (!subpel_y_q3) {
     const int16_t *const filter_x =
         av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
-    aom_convolve8_horiz_neon(ref, ref_stride, comp_pred, width, filter_x, 16,
-                             NULL, -1, width, height);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+                        -1, width, height);
   } else if (!subpel_x_q3) {
     const int16_t *const filter_y =
         av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
-    aom_convolve8_vert_neon(ref, ref_stride, comp_pred, width, NULL, -1,
-                            filter_y, 16, width, height);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+                       16, width, height);
   } else {
     DECLARE_ALIGNED(16, uint8_t,
                     im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
@@ -117,11 +117,10 @@
     const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);
 
     assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz_neon(ref - ref_vert_offset, ref_stride, im_block,
-                             MAX_SB_SIZE, filter_x, 16, NULL, -1, width,
-                             im_height);
-    aom_convolve8_vert_neon(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred,
-                            width, NULL, -1, filter_y, 16, width, height);
+    aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+                        MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height);
+    aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width,
+                       NULL, -1, filter_y, 16, width, height);
   }
 }
 
@@ -138,3 +137,153 @@
 
   aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width);
 }
+
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                          subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                          subpel_search);
+
+  aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred,
+                                  width, jcp_param);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred8, int width, int height,
+                                    int subpel_x_q3, int subpel_y_q3,
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
+  // Expect xd == NULL only in tests.
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    if (width > 4) {
+      assert(width % 8 == 0);
+      int i = height;
+      do {
+        int j = 0;
+        do {
+          uint16x8_t r = vld1q_u16(ref + j);
+          vst1q_u16(comp_pred + j, r);
+          j += 8;
+        } while (j < width);
+        ref += ref_stride;
+        comp_pred += width;
+      } while (--i != 0);
+    } else if (width == 4) {
+      int i = height;
+      do {
+        uint16x4_t r = vld1_u16(ref);
+        vst1_u16(comp_pred, r);
+        ref += ref_stride;
+        comp_pred += width;
+      } while (--i != 0);
+    } else {
+      assert(width == 2);
+      int i = height / 2;
+      do {
+        uint16x4_t r = load_u16_2x2(ref, ref_stride);
+        store_u16_2x1(comp_pred + 0 * width, r, 0);
+        store_u16_2x1(comp_pred + 1 * width, r, 1);
+        ref += 2 * ref_stride;
+        comp_pred += 2 * width;
+      } while (--i != 0);
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel,
+                                    16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL,
+                                   -1, kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz_neon(
+        ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride,
+        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+        intermediate_height, bd);
+    aom_highbd_convolve8_vert_neon(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+        bd);
+  }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_neon(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                                 height, subpel_x_q3, subpel_y_q3, ref8,
+                                 ref_stride, bd, subpel_search);
+
+  aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8,
+                                width);
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                                 height, subpel_x_q3, subpel_y_q3, ref8,
+                                 ref_stride, bd, subpel_search);
+
+  aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,
+                                         comp_pred8, width, jcp_param);
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
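A quick check of the intermediate_height expression used by the separable (horizontal then vertical) fallback above; the numbers are illustrative, not taken from a particular caller:

// intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + taps
// e.g. height = 32, subpel_y_q3 = 4, taps = 8:
//   ((31 * 8 + 4) >> 3) + 8 = (252 >> 3) + 8 = 31 + 8 = 39 rows.
// The horizontal pass therefore produces taps - 1 extra rows so the vertical
// filter has full support, which is also why it starts reading at
// ref8 - ref_stride * ((taps >> 1) - 1).
static int intermediate_rows(int height, int subpel_y_q3, int taps) {
  return (((height - 1) * 8 + subpel_y_q3) >> 3) + taps;
}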
diff --git a/av1/encoder/arm/neon/shift_neon.h b/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 0000000..d73aef2
--- /dev/null
+++ b/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"  // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg)                \
+  static AOM_INLINE void name(const type *in, type *out, int size) { \
+    int i = 0;                                                       \
+    do {                                                             \
+      out[i] = intrinsic(in[i], arg);                                \
+    } while (++i < size);                                            \
+  }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+                  vdup_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+                  vdupq_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+                  vdupq_n_s32(0))
+
+#undef SHIFT_LOOP_HELPER
+
+#endif  // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
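The last five helpers in this header rest on two per-lane identities; a standalone scalar sketch (illustrative names, plain C) of why an add can stand in for a shift left by one, and a rounding halving add with zero for a rounding shift right by one:

#include <assert.h>
#include <stdint.h>

// x + x and x << 1 produce the same lane value, so vaddq_s16(v, v) can stand
// in for vshlq_n_s16(v, 1). The rounding shift right by one is less obvious:
// vrhadd(x, 0) computes (x + 0 + 1) >> 1 at full precision, which is exactly
// the rounded result vrshr #1 would produce.
static int16_t rhadd_s16(int16_t a, int16_t b) {  // scalar model of vrhadd
  return (int16_t)(((int32_t)a + b + 1) >> 1);
}

static int16_t rshr1_s16(int16_t x) {  // scalar model of vrshr #1
  return (int16_t)(((int32_t)x + 1) >> 1);
}

static void check_rounding_identity(void) {
  for (int32_t x = INT16_MIN; x <= INT16_MAX; ++x) {
    assert(rhadd_s16((int16_t)x, 0) == rshr1_s16((int16_t)x));
  }
}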
diff --git a/av1/encoder/arm/neon/temporal_filter_neon.c b/av1/encoder/arm/neon/temporal_filter_neon.c
index 163768b..986f143 100644
--- a/av1/encoder/arm/neon/temporal_filter_neon.c
+++ b/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -22,179 +22,6 @@
 // For the squared error buffer, add padding for 4 samples.
 #define SSE_STRIDE (BW + 4)
 
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-// clang-format off
-
-DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
-  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
-  0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
-  0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
-  0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
-};
-
-// clang-format on
-
-static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
-                                const uint8_t *frame2, const uint32_t stride2,
-                                const uint32_t block_width,
-                                const uint32_t block_height,
-                                uint8_t *frame_abs_diff,
-                                const unsigned int dst_stride) {
-  uint8_t *dst = frame_abs_diff;
-
-  uint32_t i = 0;
-  do {
-    uint32_t j = 0;
-    do {
-      uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
-      uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
-      uint8x16_t abs_diff = vabdq_u8(s, r);
-      vst1q_u8(dst + j + 2, abs_diff);
-      j += 16;
-    } while (j < block_width);
-
-    dst += dst_stride;
-    i++;
-  } while (i < block_height);
-}
-
-static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
-                                      const uint32_t block_width) {
-  uint8x8_t s = vld1_u8(src);
-
-  if (col == 0) {
-    const uint8_t lane2 = vget_lane_u8(s, 2);
-    s = vset_lane_u8(lane2, s, 0);
-    s = vset_lane_u8(lane2, s, 1);
-  } else if (col >= block_width - 4) {
-    const uint8_t lane5 = vget_lane_u8(s, 5);
-    s = vset_lane_u8(lane5, s, 6);
-    s = vset_lane_u8(lane5, s, 7);
-  }
-  return vcombine_u8(s, s);
-}
-
-static void apply_temporal_filter(
-    const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
-    const uint32_t block_height, const int *subblock_mses,
-    unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
-    const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
-    const double decay_factor, const double inv_factor,
-    const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
-
-  uint32_t acc_5x5_neon[BH][BW];
-  const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
-
-  // Traverse 4 columns at a time - first and last two columns need padding.
-  for (uint32_t col = 0; col < block_width; col += 4) {
-    uint8x16_t vsrc[5][2];
-    const uint8_t *src = frame_abs_diff + col;
-
-    // Load, pad (for first and last two columns) and mask 3 rows from the top.
-    for (int i = 2; i < 5; i++) {
-      const uint8x16_t s = load_and_pad(src, col, block_width);
-      vsrc[i][0] = vandq_u8(s, vmask.val[0]);
-      vsrc[i][1] = vandq_u8(s, vmask.val[1]);
-      src += SSE_STRIDE;
-    }
-
-    // Pad the top 2 rows.
-    vsrc[0][0] = vsrc[2][0];
-    vsrc[0][1] = vsrc[2][1];
-    vsrc[1][0] = vsrc[2][0];
-    vsrc[1][1] = vsrc[2][1];
-
-    for (unsigned int row = 0; row < block_height; row++) {
-      uint32x4_t sum_01 = vdupq_n_u32(0);
-      uint32x4_t sum_23 = vdupq_n_u32(0);
-
-      sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
-      sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
-      sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
-      sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
-      sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
-
-      sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
-      sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
-      sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
-      sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
-      sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
-
-      vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
-
-      // Push all rows in the sliding window up one.
-      for (int i = 0; i < 4; i++) {
-        vsrc[i][0] = vsrc[i + 1][0];
-        vsrc[i][1] = vsrc[i + 1][1];
-      }
-
-      if (row <= block_height - 4) {
-        // Load next row into the bottom of the sliding window.
-        uint8x16_t s = load_and_pad(src, col, block_width);
-        vsrc[4][0] = vandq_u8(s, vmask.val[0]);
-        vsrc[4][1] = vandq_u8(s, vmask.val[1]);
-        src += SSE_STRIDE;
-      } else {
-        // Pad the bottom 2 rows.
-        vsrc[4][0] = vsrc[3][0];
-        vsrc[4][1] = vsrc[3][1];
-      }
-    }
-  }
-
-  // Perform filtering.
-  if (tf_wgt_calc_lvl == 0) {
-    for (unsigned int i = 0, k = 0; i < block_height; i++) {
-      for (unsigned int j = 0; j < block_width; j++, k++) {
-        const int pixel_value = frame[i * stride + j];
-        const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
-
-        const double window_error = diff_sse * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
-        const double block_error = (double)subblock_mses[subblock_idx];
-        const double combined_error =
-            weight_factor * window_error + block_error * inv_factor;
-        // Compute filter weight.
-        double scaled_error =
-            combined_error * d_factor[subblock_idx] * decay_factor;
-        scaled_error = AOMMIN(scaled_error, 7);
-        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
-        accumulator[k] += weight * pixel_value;
-        count[k] += weight;
-      }
-    }
-  } else {
-    for (unsigned int i = 0, k = 0; i < block_height; i++) {
-      for (unsigned int j = 0; j < block_width; j++, k++) {
-        const int pixel_value = frame[i * stride + j];
-        const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
-
-        const double window_error = diff_sse * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
-        const double block_error = (double)subblock_mses[subblock_idx];
-        const double combined_error =
-            weight_factor * window_error + block_error * inv_factor;
-        // Compute filter weight.
-        double scaled_error =
-            combined_error * d_factor[subblock_idx] * decay_factor;
-        scaled_error = AOMMIN(scaled_error, 7);
-        const float fweight =
-            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
-        const int weight = iroundpf(fweight);
-        accumulator[k] += weight * pixel_value;
-        count[k] += weight;
-      }
-    }
-  }
-}
-
-#else  // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
-
 // When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits.
 DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
   0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
@@ -230,8 +57,7 @@
     } while (j < block_width);
 
     dst += dst_stride;
-    i++;
-  } while (i < block_height);
+  } while (++i < block_height);
 }
 
 static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
@@ -351,8 +177,6 @@
   }
 }
 
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
 void av1_apply_temporal_filter_neon(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -393,11 +217,7 @@
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
   double d_factor[4] = { 0 };
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-  uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
-#else   // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
   uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
   uint32_t luma_sse_sum[BW * BH] = { 0 };
 
   for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
@@ -436,32 +256,6 @@
     // search is only done on Y-plane, so the information from Y-plane
     // will be more accurate. The luma sse sum is reused in both chroma
     // planes.
-#if AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-    if (plane == AOM_PLANE_U) {
-      for (unsigned int i = 0; i < plane_h; i++) {
-        for (unsigned int j = 0; j < plane_w; j++) {
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              luma_sse_sum[i * BW + j] +=
-                  (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
-                   frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
-            }
-          }
-        }
-      }
-    }
-
-    get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
-                 plane_h, frame_abs_diff, SSE_STRIDE);
-
-    apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
-                          subblock_mses, accum + plane_offset,
-                          count + plane_offset, frame_abs_diff, luma_sse_sum,
-                          inv_num_ref_pixels, decay_factor, inv_factor,
-                          weight_factor, d_factor, tf_wgt_calc_lvl);
-#else   // !(AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD))
     if (plane == AOM_PLANE_U) {
       for (unsigned int i = 0; i < plane_h; i++) {
         for (unsigned int j = 0; j < plane_w; j++) {
@@ -484,8 +278,271 @@
                           count + plane_offset, frame_sse, luma_sse_sum,
                           inv_num_ref_pixels, decay_factor, inv_factor,
                           weight_factor, d_factor, tf_wgt_calc_lvl);
-#endif  // AOM_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
     plane_offset += plane_h * plane_w;
   }
 }
+
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,
+                                                 int width, int stride,
+                                                 int edge_thresh) {
+  uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+  uint32x4_t acc = vdupq_n_u32(0);
+  // The count is logically positive (it counts how many pixels fall under the
+  // threshold), but it is accumulated negatively to make best use of the vclt
+  // instruction, which sets every bit of a lane when the condition is true,
+  // so each matching lane reads as -1 when treated as signed.
+  int32x4_t count = vdupq_n_s32(0);
+  int final_count = 0;
+  int64_t final_acc = 0;
+  const uint8_t *src_start = src + stride + 1;
+  int h = 1;
+
+  do {
+    int w = 1;
+    const uint8_t *src_ptr = src_start;
+
+    while (w <= (width - 1) - 16) {
+      uint8x16_t mat[3][3];
+      mat[0][0] = vld1q_u8(src_ptr - stride - 1);
+      mat[0][1] = vld1q_u8(src_ptr - stride);
+      mat[0][2] = vld1q_u8(src_ptr - stride + 1);
+      mat[1][0] = vld1q_u8(src_ptr - 1);
+      mat[1][1] = vld1q_u8(src_ptr);
+      mat[1][2] = vld1q_u8(src_ptr + 1);
+      mat[2][0] = vld1q_u8(src_ptr + stride - 1);
+      mat[2][1] = vld1q_u8(src_ptr + stride);
+      mat[2][2] = vld1q_u8(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x8_t gxa_lo =
+          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));
+      uint16x8_t gxa_hi =
+          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
+      uint16x8_t gxb_lo =
+          vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));
+      uint16x8_t gxb_hi =
+          vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
+      gxa_lo = vaddq_u16(
+          gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
+      gxa_hi = vaddq_u16(
+          gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
+      gxb_lo = vaddq_u16(
+          gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
+      gxb_hi = vaddq_u16(
+          gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));
+
+      uint16x8_t gya_lo =
+          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+      uint16x8_t gya_hi =
+          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+      uint16x8_t gyb_lo =
+          vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+      uint16x8_t gyb_hi =
+          vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+      gya_lo = vaddq_u16(
+          gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
+      gya_hi = vaddq_u16(
+          gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
+      gyb_lo = vaddq_u16(
+          gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
+      gyb_hi = vaddq_u16(
+          gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));
+
+      uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);
+      uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
+      uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);
+
+      uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);
+      uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);
+
+      uint16x8_t adj0_lo =
+          vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
+      uint16x8_t adj0_hi =
+          vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
+      uint16x8_t adj1_lo =
+          vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
+      uint16x8_t adj1_hi =
+          vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
+      uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
+      adj_lo = vaddq_u16(adj_lo, adj_lo);
+      uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
+      adj_hi = vaddq_u16(adj_hi, adj_hi);
+
+      uint16x8_t diag0_lo =
+          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+      uint16x8_t diag0_hi =
+          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+      uint16x8_t diag1_lo =
+          vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+      uint16x8_t diag1_hi =
+          vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+      uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);
+      uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);
+
+      uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
+      v_lo = vabdq_u16(v_lo, adj_lo);
+      uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
+      v_hi = vabdq_u16(v_hi, adj_hi);
+
+      acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));
+      acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));
+
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));
+
+      w += 16;
+      src_ptr += 16;
+    }
+
+    if (w <= (width - 1) - 8) {
+      uint8x8_t mat[3][3];
+      mat[0][0] = vld1_u8(src_ptr - stride - 1);
+      mat[0][1] = vld1_u8(src_ptr - stride);
+      mat[0][2] = vld1_u8(src_ptr - stride + 1);
+      mat[1][0] = vld1_u8(src_ptr - 1);
+      mat[1][1] = vld1_u8(src_ptr);
+      mat[1][2] = vld1_u8(src_ptr + 1);
+      mat[2][0] = vld1_u8(src_ptr + stride - 1);
+      mat[2][1] = vld1_u8(src_ptr + stride);
+      mat[2][2] = vld1_u8(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+      uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+      gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+      gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+      uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+      uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+      gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+      gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+      uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+      uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+      uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+      uint16x8_t adj = vaddq_u16(adj0, adj1);
+      adj = vaddq_u16(adj, adj);
+
+      uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+      uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+      uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+      uint16x8_t v = vaddq_u16(center, diag);
+      v = vabdq_u16(v, adj);
+
+      acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+      w += 8;
+      src_ptr += 8;
+    }
+
+    if (w <= (width - 1) - 4) {
+      uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));
+      uint8x8_t mat[3][3];
+      mat[0][0] = load_u8_4x1(src_ptr - stride - 1);
+      mat[0][1] = load_u8_4x1(src_ptr - stride);
+      mat[0][2] = load_u8_4x1(src_ptr - stride + 1);
+      mat[1][0] = load_u8_4x1(src_ptr - 1);
+      mat[1][1] = load_u8_4x1(src_ptr);
+      mat[1][2] = load_u8_4x1(src_ptr + 1);
+      mat[2][0] = load_u8_4x1(src_ptr + stride - 1);
+      mat[2][1] = load_u8_4x1(src_ptr + stride);
+      mat[2][2] = load_u8_4x1(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+      uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+      gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+      gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+      uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+      uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+      gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+      gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+      // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+      // under the threshold. This is much faster than using an if statement.
+      uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);
+
+      uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+      uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+      uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+      uint16x8_t adj = vaddq_u16(adj0, adj1);
+      adj = vaddq_u16(adj, adj);
+
+      uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+      uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+      uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+      uint16x8_t v = vaddq_u16(center, diag);
+      v = vabdq_u16(v, adj);
+
+      acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+      w += 4;
+      src_ptr += 4;
+    }
+
+    while (w < width - 1) {
+      int mat[3][3];
+      mat[0][0] = *(src_ptr - stride - 1);
+      mat[0][1] = *(src_ptr - stride);
+      mat[0][2] = *(src_ptr - stride + 1);
+      mat[1][0] = *(src_ptr - 1);
+      mat[1][1] = *(src_ptr);
+      mat[1][2] = *(src_ptr + 1);
+      mat[2][0] = *(src_ptr + stride - 1);
+      mat[2][1] = *(src_ptr + stride);
+      mat[2][2] = *(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int ga = abs(gx) + abs(gy);
+
+      // Accumulate Laplacian.
+      const int is_under = ga < edge_thresh;
+      const int v = 4 * mat[1][1] -
+                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+      final_acc += abs(v) * is_under;
+      final_count += is_under;
+
+      src_ptr++;
+      w++;
+    }
+    src_start += stride;
+  } while (++h < height - 1);
+
+  // We counted negatively, so subtract to get the final value.
+  final_count -= horizontal_add_s32x4(count);
+  final_acc += horizontal_long_add_u32x4(acc);
+  return (final_count < 16)
+             ? -1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
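A standalone illustration of the negative-counting trick used by av1_estimate_noise_from_single_plane_neon above (scalar, with made-up values in the example comment): each lane of a vclt result is either all zeros or all ones, so summing the lanes as signed integers adds -1 per hit, and a single negation at the end recovers the count.

#include <stdint.h>

// Scalar model of the counting scheme: a "true" comparison lane is 0xFFFF,
// which is -1 when reinterpreted as int16_t, so the running total is minus
// the number of samples under the threshold.
static int count_under_threshold(const uint16_t *ga, int n, uint16_t thresh) {
  int32_t count = 0;  // accumulated negatively, as in the Neon code
  for (int i = 0; i < n; ++i) {
    const uint16_t lane = (ga[i] < thresh) ? 0xFFFF : 0x0000;
    count += (int16_t)lane;  // adds -1 for every sample under the threshold
  }
  return -count;
}

// e.g. count_under_threshold((const uint16_t[]){ 1, 50, 3, 99 }, 4, 10) == 2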
diff --git a/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000..5a52e70
--- /dev/null
+++ b/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+  0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+  0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+  0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+                                const uint8_t *frame2, const uint32_t stride2,
+                                const uint32_t block_width,
+                                const uint32_t block_height,
+                                uint8_t *frame_abs_diff,
+                                const unsigned int dst_stride) {
+  uint8_t *dst = frame_abs_diff;
+
+  uint32_t i = 0;
+  do {
+    uint32_t j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+      uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      vst1q_u8(dst + j + 2, abs_diff);
+      j += 16;
+    } while (j < block_width);
+
+    dst += dst_stride;
+  } while (++i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+                                      const uint32_t block_width) {
+  uint8x8_t s = vld1_u8(src);
+
+  if (col == 0) {
+    const uint8_t lane2 = vget_lane_u8(s, 2);
+    s = vset_lane_u8(lane2, s, 0);
+    s = vset_lane_u8(lane2, s, 1);
+  } else if (col >= block_width - 4) {
+    const uint8_t lane5 = vget_lane_u8(s, 5);
+    s = vset_lane_u8(lane5, s, 6);
+    s = vset_lane_u8(lane5, s, 7);
+  }
+  return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+    const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+    const uint32_t block_height, const int *subblock_mses,
+    unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
+    const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+    const double decay_factor, const double inv_factor,
+    const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_neon[BH][BW];
+  const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+  // Traverse 4 columns at a time - first and last two columns need padding.
+  for (uint32_t col = 0; col < block_width; col += 4) {
+    uint8x16_t vsrc[5][2];
+    const uint8_t *src = frame_abs_diff + col;
+
+    // Load, pad (for first and last two columns) and mask 3 rows from the top.
+    for (int i = 2; i < 5; i++) {
+      const uint8x16_t s = load_and_pad(src, col, block_width);
+      vsrc[i][0] = vandq_u8(s, vmask.val[0]);
+      vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+      src += SSE_STRIDE;
+    }
+
+    // Pad the top 2 rows.
+    vsrc[0][0] = vsrc[2][0];
+    vsrc[0][1] = vsrc[2][1];
+    vsrc[1][0] = vsrc[2][0];
+    vsrc[1][1] = vsrc[2][1];
+
+    for (unsigned int row = 0; row < block_height; row++) {
+      uint32x4_t sum_01 = vdupq_n_u32(0);
+      uint32x4_t sum_23 = vdupq_n_u32(0);
+
+      sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
+      sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+      sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+      sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+      sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+      sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+      sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+      sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+      sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+      sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+      vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
+
+      // Push all rows in the sliding window up one.
+      for (int i = 0; i < 4; i++) {
+        vsrc[i][0] = vsrc[i + 1][0];
+        vsrc[i][1] = vsrc[i + 1][1];
+      }
+
+      if (row <= block_height - 4) {
+        // Load next row into the bottom of the sliding window.
+        uint8x16_t s = load_and_pad(src, col, block_width);
+        vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+        vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+        src += SSE_STRIDE;
+      } else {
+        // Pad the bottom 2 rows.
+        vsrc[4][0] = vsrc[3][0];
+        vsrc[4][1] = vsrc[3][1];
+      }
+    }
+  }
+
+  // Perform filtering.
+  if (tf_wgt_calc_lvl == 0) {
+    for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      for (unsigned int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame[i * stride + j];
+        const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+        // Compute filter weight.
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+        accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+      }
+    }
+  } else {
+    for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      for (unsigned int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame[i * stride + j];
+        const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx =
+            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
+        // Compute filter weight.
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor;
+        scaled_error = AOMMIN(scaled_error, 7);
+        const float fweight =
+            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+        const int weight = iroundpf(fweight);
+        accumulator[k] += weight * pixel_value;
+        count[k] += weight;
+      }
+    }
+  }
+}
+
+void av1_apply_temporal_filter_neon_dotprod(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  // Frame information.
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride =
+        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] +=
+                  (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+                   frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+            }
+          }
+        }
+      }
+    }
+
+    get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+                 plane_h, frame_abs_diff, SSE_STRIDE);
+
+    apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+                          subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_abs_diff, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor, tf_wgt_calc_lvl);
+
+    plane_offset += plane_h * plane_w;
+  }
+}
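apply_temporal_filter above builds its 5x5 window sums with the UDOT instruction; a scalar model of vdotq_u32 (one 32-bit accumulator lane per group of four bytes) shows why the 5-wide window is carried in two masked halves and finished with a pairwise add:

#include <stdint.h>

// Scalar model of vdotq_u32: lane i of the accumulator gains the dot product
// of bytes 4*i..4*i+3 of a and b. The kernel dots each masked vector with
// itself, so every lane holds a partial sum of squared absolute differences;
// because the sliding-window mask zeroes the three bytes outside the window,
// adding two adjacent lanes (vpaddq_u32) yields one complete 5-tap window sum.
static void udot_u32_scalar(uint32_t acc[4], const uint8_t a[16],
                            const uint8_t b[16]) {
  for (int lane = 0; lane < 4; ++lane) {
    for (int j = 0; j < 4; ++j) {
      acc[lane] += (uint32_t)a[4 * lane + j] * b[4 * lane + j];
    }
  }
}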
diff --git a/av1/encoder/arm/neon/txfm_neon.h b/av1/encoder/arm/neon/txfm_neon.h
new file mode 100644
index 0000000..635364f
--- /dev/null
+++ b/av1/encoder/arm/neon/txfm_neon.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+
+#include "aom/aom_integer.h"  // For AOM_INLINE.
+
+static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
+                                                  const int16_t **input,
+                                                  int *stride, int out_size) {
+  if (ud_flip) {
+    *input = *input + (out_size - 1) * *stride;
+    *stride = -*stride;
+  }
+}
+
+#endif  // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
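A small usage sketch for ud_adjust_input_and_stride (the 4x4 block size and the printing are illustrative): once the pointer has been moved to the last row and the stride negated, the usual row loop visits the block bottom-to-top, implementing the up-down flip without any copying.

#include <stdint.h>
#include <stdio.h>

static void print_block_maybe_flipped(const int16_t *input, int stride,
                                      int ud_flip) {
  const int out_size = 4;
  if (ud_flip) {  // same adjustment as ud_adjust_input_and_stride()
    input += (out_size - 1) * stride;
    stride = -stride;
  }
  for (int r = 0; r < out_size; ++r) {
    for (int c = 0; c < out_size; ++c) printf("%4d", input[r * stride + c]);
    printf("\n");
  }
}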
diff --git a/av1/encoder/arm/neon/wedge_utils_neon.c b/av1/encoder/arm/neon/wedge_utils_neon.c
index 54d8d19..1b35269 100644
--- a/av1/encoder/arm/neon/wedge_utils_neon.c
+++ b/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -75,3 +75,57 @@
   uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
 }
+
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                       vdupq_n_s32(0) };
+
+  do {
+    int16x8_t ds_l = vld1q_s16(ds);
+    int16x8_t ds_h = vld1q_s16(ds + 8);
+
+    int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
+    int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+    int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+    acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
+    acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+    acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+    acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+    ds += 16;
+    m += 16;
+    N -= 16;
+  } while (N != 0);
+
+  int64x2_t sum = vpaddlq_s32(acc[0]);
+  sum = vpadalq_s32(sum, acc[1]);
+  sum = vpadalq_s32(sum, acc[2]);
+  sum = vpadalq_s32(sum, acc[3]);
+
+  return (horizontal_add_s64x2(sum) > limit);
+}
+
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+                                          const int16_t *b_ptr, int N) {
+  do {
+    int16x8_t a = vld1q_s16(a_ptr);
+    int16x8_t b = vld1q_s16(b_ptr);
+
+    int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+    int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+    sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
+    sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+    int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
+
+    vst1q_s16(d_ptr, res);
+
+    d_ptr += 8;
+    a_ptr += 8;
+    b_ptr += 8;
+    N -= 8;
+  } while (N != 0);
+}
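Scalar restatements of the two kernels added above; these simply mirror what the Neon code computes (the tree's own C reference implementations may differ in detail):

#include <stdint.h>

// Mask-weighted sum of residual differences, compared against the limit.
static int8_t wedge_sign_from_residuals_scalar(const int16_t *ds,
                                               const uint8_t *m, int N,
                                               int64_t limit) {
  int64_t acc = 0;
  for (int i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i];
  return acc > limit;
}

// Per-sample difference of squares, saturated to int16_t like vqmovn_s32.
static void wedge_compute_delta_squares_scalar(int16_t *d, const int16_t *a,
                                               const int16_t *b, int N) {
  for (int i = 0; i < N; ++i) {
    const int32_t v = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
    const int32_t clamped =
        v > INT16_MAX ? INT16_MAX : (v < INT16_MIN ? INT16_MIN : v);
    d[i] = (int16_t)clamped;
  }
}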
diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
index 4419085..25007bb 100644
--- a/av1/encoder/av1_noise_estimate.c
+++ b/av1/encoder/av1_noise_estimate.c
@@ -34,18 +34,19 @@
 #endif
 
 void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+  const int64_t area = (int64_t)width * height;
   ne->enabled = 0;
-  ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
+  ne->level = (area < 1280 * 720) ? kLowLow : kLow;
   ne->value = 0;
   ne->count = 0;
   ne->thresh = 90;
   ne->last_w = 0;
   ne->last_h = 0;
-  if (width * height >= 1920 * 1080) {
+  if (area >= 1920 * 1080) {
     ne->thresh = 200;
-  } else if (width * height >= 1280 * 720) {
+  } else if (area >= 1280 * 720) {
     ne->thresh = 140;
-  } else if (width * height >= 640 * 360) {
+  } else if (area >= 640 * 360) {
     ne->thresh = 115;
   }
   ne->num_frames_estimate = 15;
@@ -171,7 +172,7 @@
     unsigned int max_bin = 0;
     unsigned int max_bin_count = 0;
     unsigned int bin_cnt;
-    int bsize = BLOCK_16X16;
+    BLOCK_SIZE bsize = BLOCK_16X16;
     // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
     // been encoded as zero/small mv at least x consecutive frames, compute
     // the variance to update estimate of noise in the source.
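The widening to int64_t above is about the resolution product itself; a tiny illustration of the hazard it removes (65536 x 65536 is used here as a stand-in for the largest frame the bitstream nominally allows):

#include <stdint.h>
#include <stdio.h>

static void area_demo(void) {
  const int width = 65536, height = 65536;
  // width * height in plain int would overflow (undefined behaviour);
  // widening one operand first keeps the threshold comparisons well defined.
  const int64_t area = (int64_t)width * height;
  printf("%lld\n", (long long)area);  // 4294967296
}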
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 39aa027..a9e7978 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -43,6 +43,7 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/palette.h"
+#include "av1/encoder/pickrst.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 
@@ -64,7 +65,7 @@
 
 #if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void loop_restoration_write_sb_coeffs(
-    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
     aom_writer *const w, int plane, FRAME_COUNTS *counts);
 #endif
 
@@ -1027,9 +1028,10 @@
     write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
     if (uv_mode == UV_CFL_PRED)
       write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
-    if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+    const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+    if (use_angle_delta && av1_is_directional_mode(intra_mode)) {
       write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
-                        ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
+                        ec_ctx->angle_delta_cdf[intra_mode - V_PRED]);
     }
   }
 
@@ -1621,15 +1623,18 @@
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
     int rcol0, rcol1, rrow0, rrow1;
+
+    // Skip some unnecessary work if loop restoration is disabled
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
     if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
                                            &rcol0, &rcol1, &rrow0, &rrow1)) {
-      const int rstride = cm->rst_info[plane].horz_units_per_tile;
+      const int rstride = cm->rst_info[plane].horz_units;
       for (int rrow = rrow0; rrow < rrow1; ++rrow) {
         for (int rcol = rcol0; rcol < rcol1; ++rcol) {
           const int runit_idx = rcol + rrow * rstride;
-          const RestorationUnitInfo *rui =
-              &cm->rst_info[plane].unit_info[runit_idx];
-          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts);
+          loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane,
+                                           td->counts);
         }
       }
     }
@@ -1913,8 +1918,9 @@
 }
 
 static AOM_INLINE void loop_restoration_write_sb_coeffs(
-    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
     aom_writer *const w, int plane, FRAME_COUNTS *counts) {
+  const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx];
   const RestorationInfo *rsi = cm->rst_info + plane;
   RestorationType frame_rtype = rsi->frame_restoration_type;
   assert(frame_rtype != RESTORE_NONE);
@@ -1935,9 +1941,21 @@
 #endif
     switch (unit_rtype) {
       case RESTORE_WIENER:
+#if DEBUG_LR_COSTING
+        assert(!memcmp(
+            ref_wiener_info,
+            &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info,
+            sizeof(*ref_wiener_info)));
+#endif
         write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
         break;
       case RESTORE_SGRPROJ:
+#if DEBUG_LR_COSTING
+        assert(!memcmp(&ref_sgrproj_info->xqd,
+                       &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx]
+                            .sgrproj_info.xqd,
+                       sizeof(ref_sgrproj_info->xqd)));
+#endif
         write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
         break;
       default: assert(unit_rtype == RESTORE_NONE); break;
@@ -1949,6 +1967,12 @@
     ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
 #endif
     if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+      assert(
+          !memcmp(ref_wiener_info,
+                  &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info,
+                  sizeof(*ref_wiener_info)));
+#endif
       write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
     }
   } else if (frame_rtype == RESTORE_SGRPROJ) {
@@ -1958,6 +1982,12 @@
     ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
 #endif
     if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+      assert(!memcmp(
+          &ref_sgrproj_info->xqd,
+          &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd,
+          sizeof(ref_sgrproj_info->xqd)));
+#endif
       write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
     }
   }
@@ -3335,7 +3365,7 @@
   aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
   aom_wb_write_literal(&wb, (int)obu_type, 4);
   aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
-  aom_wb_write_literal(&wb, 1, 1);  // obu_has_payload_length_field
+  aom_wb_write_literal(&wb, 1, 1);  // obu_has_size_field
   aom_wb_write_literal(&wb, 0, 1);  // reserved
 
   if (obu_extension) {
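
A sketch (not from the patch) of the lookup implied by the new
loop_restoration_write_sb_coeffs() signature: the callee recovers the unit info
from the flat per-plane array itself, so callers only pass runit_idx. The
helper name is hypothetical; the field names match the hunks above.

  static const RestorationUnitInfo *unit_at(const AV1_COMMON *cm, int plane,
                                            int rrow, int rcol) {
    const int stride = cm->rst_info[plane].horz_units;
    const int runit_idx = rcol + rrow * stride;
    return &cm->rst_info[plane].unit_info[runit_idx];
  }
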
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index 5999f9e..12e8a63 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -74,6 +74,9 @@
 
   // Index of next job to be processed.
   int next_job_idx;
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool pack_bs_mt_exit;
 } AV1EncPackBSSync;
 
 /*!\endcond */
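
A sketch (not from the patch) of how a flag like pack_bs_mt_exit is typically
consulted: read it under the pack-bitstream sync mutex before taking the next
job, so that one worker's error stops the others. The helper name and mutex
parameter are hypothetical.

  #include <pthread.h>
  #include <stdbool.h>

  static bool pack_bs_should_exit(AV1EncPackBSSync *pack_bs_sync,
                                  pthread_mutex_t *mutex) {
    pthread_mutex_lock(mutex);
    const bool exit_now = pack_bs_sync->pack_bs_mt_exit;
    pthread_mutex_unlock(mutex);
    return exit_now;
  }
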
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 360b9d4..33d2d8c 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1322,6 +1322,12 @@
   uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
   //! Color sensitivity flag for the coding block.
   uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+  //! Coding block distortion value for uv/color, minimum over the inter modes.
+  int64_t min_dist_inter_uv;
+
+  //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane
+  //! so we can keep dqcoeff of the best tx_type.
+  tran_low_t *dqcoeff_buf;
   /**@}*/
 
   /*****************************************************************************
@@ -1330,6 +1336,18 @@
   /**@{*/
   //! Variance of the source frame.
   unsigned int source_variance;
+  //! Flag to indicate that the coding block has zero SAD.
+  int block_is_zero_sad;
+  //! Flag to indicate that superblock ME in variance partition was judged
+  //! good/reliable, and so the superblock MV will be tested in the
+  //! nonrd_pickmode. This is only used for LAST_FRAME.
+  int sb_me_partition;
+  //! Flag to indicate that the superblock MV should be tested for the coding
+  //! block in the nonrd_pickmode.
+  int sb_me_block;
+  //! Motion vector of the superblock, derived from int_pro_motion() in
+  //! variance partitioning.
+  int_mv sb_me_mv;
   //! SSE of the current predictor.
   unsigned int pred_sse[REF_FRAMES];
   //! Prediction for ML based partition.
@@ -1366,6 +1384,23 @@
    * fast encoding stage for screen content tool determination.
    */
   int palette_pixels;
+
+  /*!\brief Pointer to the structure which stores the statistics used by
+   * sb-level multi-pass encoding.
+   */
+  struct SB_FIRST_PASS_STATS *sb_stats_cache;
+
+  /*!\brief Pointer to the structure which stores the statistics used by
+   * first-pass when superblock is searched twice consecutively.
+   */
+  struct SB_FIRST_PASS_STATS *sb_fp_stats;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+  /*!\brief Pointer to RD_STATS structure to be used in
+   * av1_rd_partition_search().
+   */
+  RD_STATS *rdcost;
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
 } MACROBLOCK;
 #undef SINGLE_REF_MODES
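
A minimal sketch (not from the patch) of the pointer-swap idiom the dqcoeff_buf
comment describes: keep the best tx_type's dequantized coefficients by
exchanging buffer pointers instead of copying the whole coefficient buffer. The
helper is hypothetical.

  static void swap_dqcoeff_ptrs(tran_low_t **plane_dqcoeff,
                                tran_low_t **scratch) {
    tran_low_t *const tmp = *plane_dqcoeff;
    *plane_dqcoeff = *scratch;
    *scratch = tmp;
  }
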
 
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 1992f23..3b0ee88 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -1024,8 +1024,9 @@
                                         int64_t ref_skip_rd, int mode_rate) {
   int eval_txfm = 1;
   const int txfm_rd_gate_level =
-      get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
-                             TX_SEARCH_DEFAULT, /*eval_motion_mode=*/0);
+      get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound,
+                             cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+                             TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
   // Check if the mode is good enough based on skip rd
   if (txfm_rd_gate_level) {
     int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
@@ -1108,8 +1109,9 @@
   // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
   // setting
   const int txfm_rd_gate_level =
-      get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
-                             TX_SEARCH_DEFAULT, /*eval_motion_mode=*/0);
+      get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+                             cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+                             TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
   if (txfm_rd_gate_level) {
     int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
                                     txfm_rd_gate_level, 1);
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 2bd2d7f..7b8240d 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -304,8 +304,6 @@
 }
 
 void av1_free_sms_tree(ThreadData *td) {
-  if (td->sms_tree != NULL) {
-    aom_free(td->sms_tree);
-    td->sms_tree = NULL;
-  }
+  aom_free(td->sms_tree);
+  td->sms_tree = NULL;
 }
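
The same simplification recurs throughout this change: aom_free(), like free(),
accepts a NULL pointer, so the guarding if is redundant. A minimal sketch of
the resulting idiom:

  aom_free(ptr);  // no-op when ptr == NULL
  ptr = NULL;     // clear it so a stale pointer cannot be freed or read again
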
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 90279b0..878cec5 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -724,6 +724,7 @@
 #endif
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
+
   GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   FRAME_UPDATE_TYPE update_type =
       get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
@@ -1658,6 +1659,15 @@
     cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
   }
 
+  const int is_intra_frame = frame_params.frame_type == KEY_FRAME ||
+                             frame_params.frame_type == INTRA_ONLY_FRAME;
+  FeatureFlags *const features = &cm->features;
+  if (!is_stat_generation_stage(cpi) &&
+      (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) &&
+      is_intra_frame) {
+    av1_set_screen_content_options(cpi, features);
+  }
+
 #if CONFIG_REALTIME_ONLY
   if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
       AOM_CODEC_OK) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 50f046d..2c6e49f 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -523,7 +523,7 @@
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
                       get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
   const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
-  PC_TREE *const pc_root = td->rt_pc_root;
+  PC_TREE *const pc_root = td->pc_root;
 
 #if CONFIG_RT_ML_PARTITIONING
   if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
@@ -731,9 +731,12 @@
     av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
     cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
 
-    PC_TREE *const pc_root = av1_alloc_pc_tree_node(bsize);
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (!td->pc_root)
+      aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
     av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
-                          &cur_rdc, cur_rdc, pc_root, sms_tree, NULL,
+                          &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL,
                           SB_DRY_PASS, NULL);
 
     if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
@@ -760,6 +763,7 @@
                                     const int seg_skip) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const SPEED_FEATURES *const sf = &cpi->sf;
   const TileInfo *const tile_info = &tile_data->tile_info;
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
@@ -787,11 +791,15 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, rd_use_partition_time);
 #endif
-    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    td->pc_root = av1_alloc_pc_tree_node(sb_size);
+    if (!td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
     av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                         &dummy_rate, &dummy_dist, 1, pc_root);
-    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0,
+                         &dummy_rate, &dummy_dist, 1, td->pc_root);
+    av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
                                sf->part_sf.partition_search_type);
+    td->pc_root = NULL;
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, rd_use_partition_time);
 #endif
@@ -803,20 +811,16 @@
     const BLOCK_SIZE bsize =
         seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    td->pc_root = av1_alloc_pc_tree_node(sb_size);
+    if (!td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
     av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                         &dummy_rate, &dummy_dist, 1, pc_root);
-    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0,
+                         &dummy_rate, &dummy_dist, 1, td->pc_root);
+    av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
                                sf->part_sf.partition_search_type);
+    td->pc_root = NULL;
   } else {
-    SB_FIRST_PASS_STATS *sb_org_stats = NULL;
-
-    if (cpi->oxcf.sb_qp_sweep) {
-      CHECK_MEM_ERROR(
-          cm, sb_org_stats,
-          (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(SB_FIRST_PASS_STATS)));
-      av1_backup_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
-    }
     // The most exhaustive recursive partition search
     SuperBlockEnc *sb_enc = &x->sb_enc;
     // No stats for overlay frames. Exclude key frame.
@@ -843,12 +847,16 @@
         !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
           cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
         cm->delta_q_info.delta_q_present_flag) {
+      AOM_CHECK_MEM_ERROR(
+          x->e_mbd.error_info, td->mb.sb_stats_cache,
+          (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache)));
+      av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+                          mi_col);
       assert(x->rdmult_delta_qindex == x->delta_qindex);
-      assert(sb_org_stats);
 
       const int best_qp_diff =
           sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
-                      sb_org_stats) -
+                      td->mb.sb_stats_cache) -
           x->rdmult_delta_qindex;
 
       sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
@@ -859,10 +867,13 @@
           cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
 
       av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
-      av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+      av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+                           mi_col);
 
       cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
           backup_current_qindex;
+      aom_free(td->mb.sb_stats_cache);
+      td->mb.sb_stats_cache = NULL;
     }
     if (num_passes == 1) {
 #if CONFIG_PARTITION_SEARCH_ORDER
@@ -873,24 +884,36 @@
         av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
                                 mi_col, sb_size, &this_rdc);
       } else {
-        PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+        td->pc_root = av1_alloc_pc_tree_node(sb_size);
+        if (!td->pc_root)
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PC_TREE");
         av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                              &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
-                              SB_SINGLE_PASS, NULL);
+                              &dummy_rdc, dummy_rdc, td->pc_root, sms_root,
+                              NULL, SB_SINGLE_PASS, NULL);
       }
 #else
-      PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+      td->pc_root = av1_alloc_pc_tree_node(sb_size);
+      if (!td->pc_root)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
       av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                            &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+                            &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
                             SB_SINGLE_PASS, NULL);
 #endif  // CONFIG_PARTITION_SEARCH_ORDER
     } else {
       // First pass
-      SB_FIRST_PASS_STATS sb_fp_stats;
-      av1_backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
-      PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(sb_size);
+      AOM_CHECK_MEM_ERROR(
+          x->e_mbd.error_info, td->mb.sb_fp_stats,
+          (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats)));
+      av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+                          mi_col);
+      td->pc_root = av1_alloc_pc_tree_node(sb_size);
+      if (!td->pc_root)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
       av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                            &dummy_rdc, dummy_rdc, pc_root_p0, sms_root, NULL,
+                            &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
                             SB_DRY_PASS, NULL);
 
       // Second pass
@@ -899,14 +922,19 @@
       av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
       av1_reset_simple_motion_tree_partition(sms_root, sb_size);
 
-      av1_restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+                           mi_col);
 
-      PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(sb_size);
+      td->pc_root = av1_alloc_pc_tree_node(sb_size);
+      if (!td->pc_root)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
       av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                            &dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
+                            &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
                             SB_WET_PASS, NULL);
+      aom_free(td->mb.sb_fp_stats);
+      td->mb.sb_fp_stats = NULL;
     }
-    aom_free(sb_org_stats);
 
     // Reset to 0 so that it wouldn't be used elsewhere mistakenly.
     sb_enc->tpl_data_count = 0;
@@ -1124,6 +1152,17 @@
     // top-right superblock to finish encoding.
     enc_row_mt->sync_read_ptr(
         row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+
+#if CONFIG_MULTITHREAD
+    if (row_mt_enabled) {
+      pthread_mutex_lock(enc_row_mt->mutex_);
+      const bool row_mt_exit = enc_row_mt->row_mt_exit;
+      pthread_mutex_unlock(enc_row_mt->mutex_);
+      // Exit in case any worker has encountered an error.
+      if (row_mt_exit) return;
+    }
+#endif
+
     const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
     if (update_cdf && (tile_info->mi_row_start != mi_row)) {
       if ((tile_info->mi_col_start == mi_col)) {
@@ -1155,6 +1194,9 @@
     x->content_state_sb.lighting_change = 0;
     x->content_state_sb.low_sumdiff = 0;
     x->force_zeromv_skip_for_sb = 0;
+    x->sb_me_block = 0;
+    x->sb_me_partition = 0;
+    x->sb_me_mv.as_int = 0;
 
     if (cpi->oxcf.mode == ALLINTRA) {
       x->intra_sb_rdmult_modifier = 128;
@@ -1230,7 +1272,7 @@
 
   av1_row_mt_mem_dealloc(cpi);
 
-  if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+  aom_free(cpi->tile_data);
   CHECK_MEM_ERROR(
       cm, cpi->tile_data,
       aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
@@ -1441,7 +1483,7 @@
     }
   }
 
-  av1_dealloc_mb_data(cm, mb);
+  av1_dealloc_mb_data(mb, av1_num_planes(cm));
 }
 
 // Set the relative distance of a reference frame w.r.t. current frame
@@ -1670,6 +1712,19 @@
   }
 }
 
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+                                    int8_t *is_block_same[2][3]) {
+  for (int k = 0; k < 2; ++k) {
+    for (int j = 0; j < 2; ++j) {
+      aom_free(block_hash_values[k][j]);
+    }
+
+    for (int j = 0; j < 3; ++j) {
+      aom_free(is_block_same[k][j]);
+    }
+  }
+}
+
 /*!\brief Encoder setup (only for the current frame), encoding, and reconstruction
  * for a single frame
  *
@@ -1740,26 +1795,34 @@
     // add to hash table
     const int pic_width = cpi->source->y_crop_width;
     const int pic_height = cpi->source->y_crop_height;
-    uint32_t *block_hash_values[2][2];
-    int8_t *is_block_same[2][3];
+    uint32_t *block_hash_values[2][2] = { { NULL } };
+    int8_t *is_block_same[2][3] = { { NULL } };
     int k, j;
+    bool error = false;
 
-    for (k = 0; k < 2; k++) {
-      for (j = 0; j < 2; j++) {
-        CHECK_MEM_ERROR(cm, block_hash_values[k][j],
-                        aom_malloc(sizeof(uint32_t) * pic_width * pic_height));
+    for (k = 0; k < 2 && !error; ++k) {
+      for (j = 0; j < 2; ++j) {
+        block_hash_values[k][j] = (uint32_t *)aom_malloc(
+            sizeof(*block_hash_values[0][0]) * pic_width * pic_height);
+        if (!block_hash_values[k][j]) {
+          error = true;
+          break;
+        }
       }
 
-      for (j = 0; j < 3; j++) {
-        CHECK_MEM_ERROR(cm, is_block_same[k][j],
-                        aom_malloc(sizeof(int8_t) * pic_width * pic_height));
+      for (j = 0; j < 3 && !error; ++j) {
+        is_block_same[k][j] = (int8_t *)aom_malloc(
+            sizeof(*is_block_same[0][0]) * pic_width * pic_height);
+        if (!is_block_same[k][j]) error = true;
       }
     }
 
     av1_hash_table_init(intrabc_hash_info);
-    if (!av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+    if (error ||
+        !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+      free_block_hash_buffers(block_hash_values, is_block_same);
       aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
-                         "Error allocating intrabc_hash_table");
+                         "Error allocating intrabc_hash_table and buffers");
     }
     hash_table_created = 1;
     av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
@@ -1769,7 +1832,6 @@
     const int max_sb_size =
         (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
     int src_idx = 0;
-    bool error = false;
     for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
       const int dst_idx = !src_idx;
       av1_generate_block_hash_value(
@@ -1787,15 +1849,7 @@
       }
     }
 
-    for (k = 0; k < 2; k++) {
-      for (j = 0; j < 2; j++) {
-        aom_free(block_hash_values[k][j]);
-      }
-
-      for (j = 0; j < 3; j++) {
-        aom_free(is_block_same[k][j]);
-      }
-    }
+    free_block_hash_buffers(block_hash_values, is_block_same);
 
     if (error) {
       aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
@@ -1957,12 +2011,19 @@
       // Preallocate the pc_tree for realtime coding to reduce the cost of
       // memory allocation.
       const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
-      td->rt_pc_root = use_nonrd_mode
-                           ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
-                           : NULL;
+      if (use_nonrd_mode) {
+        td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+        if (!td->pc_root)
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PC_TREE");
+      } else {
+        td->pc_root = NULL;
+      }
+
       encode_tiles(cpi);
-      av1_free_pc_tree_recursive(td->rt_pc_root, av1_num_planes(cm), 0, 0,
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                                  cpi->sf.part_sf.partition_search_type);
+      td->pc_root = NULL;
     }
   }
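
A sketch (not from the patch body) of what the AOM_CHECK_MEM_ERROR(...) calls
added above roughly amount to, and why they take x->e_mbd.error_info: the
failure is reported through the thread's own error context rather than the
shared cm->error, presumably so a failing worker unwinds through its own
handler. The expansion below is illustrative, not the actual macro.

  td->mb.sb_fp_stats =
      (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats));
  if (!td->mb.sb_fp_stats)
    aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate td->mb.sb_fp_stats");
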
 
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 29d7fe4..94298c8 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -22,7 +22,7 @@
                          const int mi_col, int *const rdmult) {
   const AV1_COMMON *const cm = &cpi->common;
 
-  const int bsize_base = BLOCK_16X16;
+  const BLOCK_SIZE bsize_base = BLOCK_16X16;
   const int num_mi_w = mi_size_wide[bsize_base];
   const int num_mi_h = mi_size_high[bsize_base];
   const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
@@ -177,7 +177,7 @@
   const int block_mi_width_sr =
       coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
 
-  const int bsize_base = BLOCK_16X16;
+  const BLOCK_SIZE bsize_base = BLOCK_16X16;
   const int num_mi_w = mi_size_wide[bsize_base];
   const int num_mi_h = mi_size_high[bsize_base];
   const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
@@ -588,13 +588,13 @@
       update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
     }
   }
-  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
-      av1_use_angle_delta(bsize)) {
+  const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+  if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
 #if CONFIG_ENTROPY_STATS
-    ++counts->angle_delta[uv_mode - UV_V_PRED]
+    ++counts->angle_delta[intra_mode - V_PRED]
                          [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
 #endif
-    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+    update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED],
                mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
                2 * MAX_ANGLE_DELTA + 1);
   }
@@ -1743,3 +1743,23 @@
     default: assert(0);
   }
 }
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    aom_free(mb->plane[plane].src_diff);
+    mb->plane[plane].src_diff = NULL;
+  }
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) {
+  const int num_planes = av1_num_planes(cm);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_xy =
+        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+              : 0;
+    const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+    CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
+                    (int16_t *)aom_memalign(
+                        32, sizeof(*mb->plane[plane].src_diff) * sb_size));
+  }
+}
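
A worked sketch (not from the patch) of the sizing in av1_alloc_src_diff_buf():
the luma plane always gets MAX_SB_SQUARE residual samples; for 4:2:0 content,
subsampling_x == subsampling_y == 1, so each chroma plane gets a quarter of
that. The helper name is hypothetical.

  static int src_diff_elems(int plane, int subsampling_x, int subsampling_y) {
    const int subsampling_xy = plane ? subsampling_x + subsampling_y : 0;
    return MAX_SB_SQUARE >> subsampling_xy;  // 4:2:0 chroma: MAX_SB_SQUARE >> 2
  }
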
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 24a36c5..14c71b8 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -430,25 +430,26 @@
                            const TileInfo *const tile_info, const int mi_row,
                            const int mi_col);
 
-static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm,
-                                           struct macroblock *mb) {
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes);
+
+static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb,
+                                           int num_planes) {
   aom_free(mb->txfm_search_info.mb_rd_record);
   mb->txfm_search_info.mb_rd_record = NULL;
 
   aom_free(mb->inter_modes_info);
   mb->inter_modes_info = NULL;
 
-  const int num_planes = av1_num_planes(cm);
-  for (int plane = 0; plane < num_planes; plane++) {
-    aom_free(mb->plane[plane].src_diff);
-    mb->plane[plane].src_diff = NULL;
-  }
+  av1_dealloc_src_diff_buf(mb, num_planes);
 
   aom_free(mb->e_mbd.seg_mask);
   mb->e_mbd.seg_mask = NULL;
 
   aom_free(mb->winner_mode_stats);
   mb->winner_mode_stats = NULL;
+
+  aom_free(mb->dqcoeff_buf);
+  mb->dqcoeff_buf = NULL;
 }
 
 static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
@@ -468,6 +469,8 @@
                       winner_mode_count * sizeof(mb->winner_mode_stats[0])));
 }
 
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb);
+
 static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
                                          struct macroblock *mb) {
   const AV1_COMMON *cm = &cpi->common;
@@ -483,21 +486,20 @@
           cm, mb->inter_modes_info,
           (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
   }
-  const int num_planes = av1_num_planes(cm);
-  for (int plane = 0; plane < num_planes; plane++) {
-    const int subsampling_xy =
-        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
-              : 0;
-    const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
-    CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
-                    (int16_t *)aom_memalign(
-                        32, sizeof(*mb->plane[plane].src_diff) * sb_size));
-  }
+
+  av1_alloc_src_diff_buf(cm, mb);
+
   CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
                   (uint8_t *)aom_memalign(
                       16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
 
   allocate_winner_mode_stats(cpi, mb);
+
+  const int max_sb_square_y = 1
+                              << num_pels_log2_lookup[cm->seq_params->sb_size];
+  CHECK_MEM_ERROR(
+      cm, mb->dqcoeff_buf,
+      (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t)));
 }
 
 // This function will compute the number of reference frames to be disabled
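
Worked numbers (illustrative, not from the patch) for the dqcoeff_buf
allocation above, assuming num_pels_log2_lookup[] gives the log2 of the pixel
count of a block size:

  // sb_size == BLOCK_64X64   -> max_sb_square_y == 1 << 12 ==  4096 coefficients
  // sb_size == BLOCK_128X128 -> max_sb_square_y == 1 << 14 == 16384 coefficients
  // bytes allocated == max_sb_square_y * sizeof(tran_low_t), 32-byte aligned
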
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 78efa0c..c78761d 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -449,10 +449,16 @@
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
   if (p->eobs[block]) {
-    *(args->skip) = 0;
+    // As long as any YUV plane has non-zero quantized transform coefficients,
+    // mbmi->skip_txfm flag is set to 0.
+    mbmi->skip_txfm = 0;
     av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
                                 pd->dst.stride, p->eobs[block],
                                 cm->features.reduced_tx_set_used);
+  } else {
+    // Only when YUV planes all have zero quantized transform coefficients,
+    // mbmi->skip_txfm flag is set to 1.
+    mbmi->skip_txfm &= 1;
   }
 
   // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
@@ -650,13 +656,19 @@
   assert(bsize < BLOCK_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
+  // In the current encoder implementation, for inter blocks the
+  // mbmi->skip_txfm flag is set to 1 only when all YUV planes have zero
+  // quantized transform coefficients.
+  // For intra blocks, this flag is set to 0 since skipped blocks are so rare
+  // that transmitting skip_txfm = 1 is very expensive.
+  // mbmi->skip_txfm is initialized to 1 here and is updated in encode_block()
+  // based on transform, quantization, and (if enabled) trellis optimization.
   mbmi->skip_txfm = 1;
   if (x->txfm_search_info.skip_txfm) return;
 
   struct optimize_ctx ctx;
   struct encode_b_args arg = {
-    cpi,  x,    &ctx,    &mbmi->skip_txfm,
-    NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+    cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
   };
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -727,6 +739,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
@@ -820,9 +833,9 @@
     update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
   }
 
-  // For intra mode, skipped blocks are so rare that transmitting skip=1 is
-  // very expensive.
-  *(args->skip) = 0;
+  // For intra mode, skipped blocks are so rare that transmitting
+  // skip_txfm = 1 is very expensive.
+  mbmi->skip_txfm = 0;
 
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
@@ -841,8 +854,9 @@
   const int ss_y = pd->subsampling_y;
   ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
   ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
-  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip_txfm),
-                               ta,  tl, dry_run, enable_optimize_b };
+  struct encode_b_args arg = {
+    cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+  };
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
   if (enable_optimize_b) {
     av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
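
A compact summary (not from the patch) of where mbmi->skip_txfm ends up after
the per-block updates above, given the mbmi->skip_txfm = 1 initialization shown
before the plane loop:

  // inter block, any plane with eob != 0    -> skip_txfm = 0  (encode_block)
  // inter block, every plane with eob == 0  -> skip_txfm stays 1
  // intra block                             -> skip_txfm = 0; skipped intra
  //   blocks are too rare to be worth signalling skip_txfm = 1
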
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index b819e82..f97bf8f 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -56,7 +56,6 @@
   const struct AV1_COMP *cpi;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  uint8_t *skip;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
   RUN_TYPE dry_run;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index e1cb49b..550504f 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -236,16 +236,16 @@
                                  size_t encoded_frame_size) {
   const int upscaled_width = cm->superres_upscaled_width;
   const int height = cm->height;
-  const int luma_pic_size = upscaled_width * height;
+  const int64_t luma_pic_size = (int64_t)upscaled_width * height;
   const SequenceHeader *const seq_params = cm->seq_params;
   const BITSTREAM_PROFILE profile = seq_params->profile;
   const int pic_size_profile_factor =
       profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
   encoded_frame_size =
       (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1);
-  const size_t uncompressed_frame_size =
+  const int64_t uncompressed_frame_size =
       (luma_pic_size * pic_size_profile_factor) >> 3;
-  return uncompressed_frame_size / (double)encoded_frame_size;
+  return (double)uncompressed_frame_size / encoded_frame_size;
 }
 
 static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
@@ -362,9 +362,9 @@
 static INLINE int does_level_match(int width, int height, double fps,
                                    int lvl_width, int lvl_height,
                                    double lvl_fps, int lvl_dim_mult) {
-  const int64_t lvl_luma_pels = lvl_width * lvl_height;
+  const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height;
   const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
-  const int64_t luma_pels = width * height;
+  const int64_t luma_pels = (int64_t)width * height;
   const double display_sample_rate = luma_pels * fps;
   return luma_pels <= lvl_luma_pels &&
          display_sample_rate <= lvl_display_sample_rate &&
@@ -1278,7 +1278,7 @@
     enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
                   BLOCK_4X4);
 
-    const int bsize = BLOCK_16X16;
+    const BLOCK_SIZE bsize = BLOCK_16X16;
     const int w = mi_size_wide[bsize];
     const int h = mi_size_high[bsize];
     const int num_cols = (mi_params.mi_cols + w - 1) / w;
@@ -1410,6 +1410,8 @@
   cm->current_frame.frame_number = 0;
   cpi->rc.frame_number_encoded = 0;
   cpi->rc.prev_frame_is_dropped = 0;
+  cpi->rc.max_consec_drop = INT_MAX;
+  cpi->rc.drop_count_consec = 0;
   cm->current_frame_id = -1;
   cpi->tile_data = NULL;
   cpi->last_show_frame_buf = NULL;
@@ -1493,9 +1495,10 @@
   cpi->mb_weber_stats = NULL;
   cpi->mb_delta_q = NULL;
   cpi->palette_pixel_num = 0;
+  cpi->scaled_last_source_available = 0;
 
   {
-    const int bsize = BLOCK_16X16;
+    const BLOCK_SIZE bsize = BLOCK_16X16;
     const int w = mi_size_wide[bsize];
     const int h = mi_size_high[bsize];
     const int num_cols = (max_mi_cols + w - 1) / w;
@@ -1510,7 +1513,7 @@
 
 #if CONFIG_TUNE_VMAF
   {
-    const int bsize = BLOCK_64X64;
+    const BLOCK_SIZE bsize = BLOCK_64X64;
     const int w = mi_size_wide[bsize];
     const int h = mi_size_high[bsize];
     const int num_cols = (mi_params->mi_cols + w - 1) / w;
@@ -1699,6 +1702,7 @@
   pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
   pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
   pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_;
   pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
   if (enc_row_mt_mutex_ != NULL) {
     pthread_mutex_destroy(enc_row_mt_mutex_);
@@ -1712,6 +1716,10 @@
     pthread_mutex_destroy(gm_mt_mutex_);
     aom_free(gm_mt_mutex_);
   }
+  if (tpl_error_mutex_ != NULL) {
+    pthread_mutex_destroy(tpl_error_mutex_);
+    aom_free(tpl_error_mutex_);
+  }
   if (pack_bs_mt_mutex_ != NULL) {
     pthread_mutex_destroy(pack_bs_mt_mutex_);
     aom_free(pack_bs_mt_mutex_);
@@ -1720,11 +1728,11 @@
   av1_row_mt_mem_dealloc(cpi);
 
   if (mt_info->num_workers > 1) {
+    av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
     av1_loop_filter_dealloc(&mt_info->lf_row_sync);
     av1_cdef_mt_dealloc(&mt_info->cdef_sync);
 #if !CONFIG_REALTIME_ONLY
     av1_loop_restoration_dealloc(&mt_info->lr_row_sync);
-    av1_gm_dealloc(&mt_info->gm_sync);
     av1_tf_mt_dealloc(&mt_info->tf_sync);
 #endif
   }
@@ -1950,6 +1958,7 @@
   const int stride = cpi->unfiltered_source->y_stride;
   const int width = cpi->unfiltered_source->y_width;
   const int height = cpi->unfiltered_source->y_height;
+  const int64_t area = (int64_t)width * height;
   const int bd = cm->seq_params->bit_depth;
   const int blk_w = 16;
   const int blk_h = 16;
@@ -1957,10 +1966,10 @@
   const int color_thresh = 4;
   const unsigned int var_thresh = 0;
   // Counts of blocks with no more than color_thresh colors.
-  int counts_1 = 0;
+  int64_t counts_1 = 0;
   // Counts of blocks with no more than color_thresh colors and variance larger
   // than var_thresh.
-  int counts_2 = 0;
+  int64_t counts_2 = 0;
 
   for (int r = 0; r + blk_h <= height; r += blk_h) {
     for (int c = 0; c + blk_w <= width; c += blk_w) {
@@ -1985,17 +1994,15 @@
   }
 
   // The threshold values are selected experimentally.
-  features->allow_screen_content_tools =
-      counts_1 * blk_h * blk_w * 10 > width * height;
+  features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area;
   // IntraBC would force loop filters off, so we use more strict rules that also
   // requires that the block has high variance.
   features->allow_intrabc = features->allow_screen_content_tools &&
-                            counts_2 * blk_h * blk_w * 12 > width * height;
+                            counts_2 * blk_h * blk_w * 12 > area;
   cpi->use_screen_content_tools = features->allow_screen_content_tools;
   cpi->is_screen_content_type =
-      features->allow_intrabc ||
-      (counts_1 * blk_h * blk_w * 10 > width * height * 4 &&
-       counts_2 * blk_h * blk_w * 30 > width * height);
+      features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 &&
+                                  counts_2 * blk_h * blk_w * 30 > area);
 }
 
 static void init_motion_estimation(AV1_COMP *cpi) {
@@ -2044,29 +2051,6 @@
   }
 }
 
-#if !CONFIG_REALTIME_ONLY
-#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
-static void set_restoration_unit_size(int width, int height, int sx, int sy,
-                                      RestorationInfo *rst) {
-  (void)width;
-  (void)height;
-  (void)sx;
-  (void)sy;
-#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
-  int s = AOMMIN(sx, sy);
-#else
-  int s = 0;
-#endif  // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
-
-  if (width * height > 352 * 288)
-    rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
-  else
-    rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
-  rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s;
-  rst[2].restoration_unit_size = rst[1].restoration_unit_size;
-}
-#endif
-
 static void init_ref_frame_bufs(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int i;
@@ -2232,11 +2216,6 @@
 
 #if !CONFIG_REALTIME_ONLY
   if (is_restoration_used(cm)) {
-    const int frame_width = cm->superres_upscaled_width;
-    const int frame_height = cm->superres_upscaled_height;
-    set_restoration_unit_size(frame_width, frame_height,
-                              seq_params->subsampling_x,
-                              seq_params->subsampling_y, cm->rst_info);
     for (int i = 0; i < num_planes; ++i)
       cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
@@ -2310,18 +2289,8 @@
     start_timing(cpi, cdef_time);
 #endif
     const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
-    const int use_screen_content_model =
-        cm->quant_params.base_qindex >
-            AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
-                   cpi->rc.best_quality + 5) &&
-        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
     // Find CDEF parameters
-    av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
-                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult,
-                    cpi->sf.rt_sf.skip_cdef_sb, cpi->oxcf.tool_cfg.cdef_control,
-                    use_screen_content_model,
-                    cpi->ppi->rtc_ref.non_reference_frame,
-                    cpi->rc.rtc_external_ratectrl);
+    av1_cdef_search(cpi);
 
     // Apply the filter
     if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
@@ -2499,7 +2468,6 @@
   const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
   SVC *const svc = &cpi->svc;
   const int resize_pending = is_frame_resize_pending(cpi);
-
   int top_index = 0, bottom_index = 0, q = 0;
   YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
   InterpFilter filter_scaler =
@@ -2588,7 +2556,10 @@
     memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv));
   }
 
-  if (cpi->unscaled_last_source != NULL) {
+  if (cpi->scaled_last_source_available) {
+    cpi->last_source = &cpi->scaled_last_source;
+    cpi->scaled_last_source_available = 0;
+  } else if (cpi->unscaled_last_source != NULL) {
     cpi->last_source = av1_realloc_and_scale_if_required(
         cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
         phase_scaler, true, false, cpi->oxcf.border_in_pixels,
@@ -2610,12 +2581,15 @@
   // av1_scale_references. Note GOLDEN is forced to update on the (first/trigger)
   // resized frame and ALTREF will be refreshed ~4 frames later, so both
   // references become available again after a few frames.
+  // For superres: don't disable golden reference.
   if (svc->number_spatial_layers == 1) {
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
-      const YV12_BUFFER_CONFIG *const ref =
-          get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
-        cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+    if (!cpi->oxcf.superres_cfg.enable_superres) {
+      if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+        const YV12_BUFFER_CONFIG *const ref =
+            get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+        if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+          cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+      }
     }
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
       const YV12_BUFFER_CONFIG *const ref =
@@ -2632,16 +2606,9 @@
 #endif  // CONFIG_FPMT_TEST
   if (scale_references ||
       cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
-    // For SVC the inter-layer/spatial prediction is not done for newmv
-    // (zero_mode is forced), and since the scaled references are only
-    // use for newmv search, we can avoid scaling here when
-    // force_zero_mode_spatial_ref is set for SVC mode.
-    // Also add condition for dynamic_resize: for dynamic_resize we always
-    // check for scaling references for now.
-    if (!frame_is_intra_only(cm) &&
-        (!cpi->ppi->use_svc || !cpi->svc.force_zero_mode_spatial_ref ||
-         cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC))
+    if (!frame_is_intra_only(cm)) {
       av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+    }
   }
 
   av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
@@ -2733,6 +2700,22 @@
       sf->rt_sf.gf_refresh_based_on_qp)
     av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
 
+  // For non-svc: if scaling is required, copy scaled_source
+  // into scaled_last_source.
+  if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc &&
+      cpi->scaled_source.y_buffer != NULL &&
+      cpi->scaled_last_source.y_buffer != NULL &&
+      cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width &&
+      cpi->scaled_source.y_crop_height ==
+          cpi->scaled_last_source.y_crop_height &&
+      (cm->width != cpi->unscaled_source->y_crop_width ||
+       cm->height != cpi->unscaled_source->y_crop_height)) {
+    cpi->scaled_last_source_available = 1;
+    aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source);
+    aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source);
+    aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source);
+  }
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_encode_frame_time);
 #endif
@@ -3051,7 +3034,7 @@
              rc->projected_frame_size, psnr.sse[0]);
       ++rd_command->frame_index;
       if (rd_command->frame_index == rd_command->frame_count) {
-        exit(0);
+        return AOM_CODEC_ERROR;
       }
 #endif  // CONFIG_RD_COMMAND
 
@@ -3585,10 +3568,6 @@
   start_timing(cpi, encode_frame_to_data_rate_time);
 #endif
 
-  if (frame_is_intra_only(cm)) {
-    av1_set_screen_content_options(cpi, features);
-  }
-
 #if !CONFIG_REALTIME_ONLY
   calculate_frame_avg_haar_energy(cpi);
 #endif
@@ -3736,19 +3715,14 @@
     cpi->num_tg = DEFAULT_MAX_NUM_TG;
   }
 
-  // For 1 pass CBR, check if we are dropping this frame.
-  // Never drop on key frame, or for frame whose base layer is key.
-  if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
-      current_frame->frame_type != KEY_FRAME &&
-      !(cpi->ppi->use_svc &&
-        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
-    FRAME_UPDATE_TYPE update_type =
-        cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
-    (void)update_type;
-    assert(
-        IMPLIES(cpi->is_dropped_frame, (update_type == OVERLAY_UPDATE ||
-                                        update_type == INTNL_OVERLAY_UPDATE)));
-    if (av1_rc_drop_frame(cpi)) {
+  // For 1 pass CBR mode: check if we are dropping this frame.
+  if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+    // Always drop for spatial enhancement layer if layer bandwidth is 0.
+    // Otherwise check for frame-dropping based on buffer level in
+    // av1_rc_drop_frame().
+    if ((cpi->svc.spatial_layer_id > 0 &&
+         cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+        av1_rc_drop_frame(cpi)) {
       cpi->is_dropped_frame = true;
     }
     if (cpi->is_dropped_frame) {
@@ -4438,10 +4412,8 @@
 
     fclose(f);
 
-    if (ppi->ssim_vars != NULL) {
-      aom_free(ppi->ssim_vars);
-      ppi->ssim_vars = NULL;
-    }
+    aom_free(ppi->ssim_vars);
+    ppi->ssim_vars = NULL;
   }
 }
 #endif  // CONFIG_INTERNAL_STATS
@@ -4481,7 +4453,16 @@
 
 static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
   // Increment the gf group index ready for the next frame.
-  ++cpi->gf_frame_index;
+  if (is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+    ++cpi->gf_frame_index;
+    // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+    // for real time encoding.
+    if (cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
+      cpi->gf_frame_index = 0;
+  } else {
+    ++cpi->gf_frame_index;
+  }
 }
 
 static void update_fb_of_context_type(const AV1_COMP *const cpi,
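
Worked numbers (illustrative, not from the patch) for the compression-ratio
computation above, assuming PROFILE_0 and a 1920x1080 upscaled frame:

  // luma_pic_size           = 1920 * 1080          = 2073600
  // pic_size_profile_factor = 15
  // uncompressed_frame_size = (2073600 * 15) >> 3  = 3888000 bytes
  // returned ratio          = 3888000 / (double)encoded_frame_size

The int64_t widening keeps this exact even at resolutions where the 32-bit
product would overflow.
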
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 9b2b4ae..9d02993 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1442,6 +1442,8 @@
 
 typedef struct ThreadData {
   MACROBLOCK mb;
+  MvCosts *mv_costs_alloc;
+  IntraBCMVCosts *dv_costs_alloc;
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
   PC_TREE_SHARED_BUFFERS shared_coeff_buf;
@@ -1454,6 +1456,7 @@
   CONV_BUF_TYPE *tmp_conv_dst;
   uint64_t abs_sum_level;
   uint8_t *tmp_pred_bufs[2];
+  uint8_t *wiener_tmp_pred_buf;
   int intrabc_used;
   int deltaq_used;
   int coefficient_size;
@@ -1464,7 +1467,9 @@
   int32_t num_64x64_blocks;
   PICK_MODE_CONTEXT *firstpass_ctx;
   TemporalFilterData tf_data;
+  TplBuffers tpl_tmp_buffers;
   TplTxfmStats tpl_txfm_stats;
+  GlobalMotionData gm_data;
   // Pointer to the array of structures to store gradient information of each
   // pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
   // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
@@ -1474,8 +1479,8 @@
   // store source variance and log of source variance of each 4x4 sub-block
   // for subsequent retrieval.
   Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
-  // The pc tree root for RTC non-rd case.
-  PC_TREE *rt_pc_root;
+  // Pointer to pc tree root.
+  PC_TREE *pc_root;
 } ThreadData;
 
 struct EncWorkerData;
@@ -1526,6 +1531,26 @@
    */
   int allocated_sb_rows;
 
+  /*!
+   * Initialized to false, set to true by the worker thread that encounters an
+   * error in order to abort the processing of other worker threads.
+   */
+  bool row_mt_exit;
+
+  /*!
+   * Initialized to false, set to true during first pass encoding by the worker
+   * thread that encounters an error in order to abort the processing of other
+   * worker threads.
+   */
+  bool firstpass_mt_exit;
+
+  /*!
+   * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker
+   * thread that encounters an error in order to abort the processing of other
+   * worker threads.
+   */
+  bool mb_wiener_mt_exit;
+
 #if CONFIG_MULTITHREAD
   /*!
    * Mutex lock used while dispatching jobs.
@@ -1619,6 +1644,45 @@
 } RestoreStateBuffers;
 
 /*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+  /*!
+   * Stores the best coefficients for Wiener restoration.
+   */
+  WienerInfo wiener;
+
+  /*!
+   * Stores the best coefficients for Sgrproj restoration.
+   */
+  SgrprojInfo sgrproj;
+
+  /*!
+   * The rtype to use for this unit given a frame rtype as index. Indices:
+   * WIENER, SGRPROJ, SWITCHABLE.
+   */
+  RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold the search parameters per restoration unit and the
+ * intermediate Wiener filter buffer used in the pick-filter stage of loop
+ * restoration.
+ */
+typedef struct {
+  /*!
+   * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+   * restoration types.
+   */
+  RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+  /*!
+   * Buffer used to hold the dgd-avg data during the SIMD call of the Wiener
+   * filter.
+   */
+  int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
  * \brief Primary Encoder parameters related to multi-threading.
  */
 typedef struct PrimaryMultiThreadInfo {
@@ -2938,6 +3002,11 @@
   TemporalFilterCtx tf_ctx;
 
   /*!
+   * Pointer to CDEF search context.
+   */
+  CdefSearchCtx *cdef_search_ctx;
+
+  /*!
    * Variables related to forcing integer mv decisions for the current frame.
    */
   ForceIntegerMVInfo force_intpel_info;
@@ -3220,6 +3289,11 @@
   AV1LrStruct lr_ctxt;
 
   /*!
+   * Loop Restoration context used during pick stage.
+   */
+  AV1LrPickStruct pick_lr_ctxt;
+
+  /*!
    * Pointer to list of tables with film grain parameters.
    */
   aom_film_grain_table_t *film_grain_table;
@@ -3564,6 +3638,12 @@
    * fast encoding pass in av1_determine_sc_tools_with_encoding().
    */
   int palette_pixel_num;
+
+  /*!
+   * Flag to indicate scaled_last_source is available,
+   * so scaling is not needed for last_source.
+   */
+  int scaled_last_source_available;
 } AV1_COMP;
 
 /*!
@@ -3782,6 +3862,10 @@
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
 
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
 // Set screen content options.
 // This function estimates whether to use screen content tools, by counting
 // the portion of blocks that have few luma colors.
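
A sketch (not from the patch) of how the pick stage is expected to consult the
new pick_lr_ctxt fields: one RestUnitSearchInfo per restoration unit per plane,
with best_rtype indexed by the candidate frame restoration type. The indexing
assumes RESTORE_NONE is enum value 0 and is therefore excluded from the array.

  const RestUnitSearchInfo *info = &cpi->pick_lr_ctxt.rusi[plane][runit_idx];
  const RestorationType unit_rtype = info->best_rtype[frame_rtype - 1];
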
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 9a1d60f..d0fd782 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -13,10 +13,13 @@
 #define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
 
 #include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
+#include "av1/encoder/global_motion_facade.h"
 #include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/pickcdef.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,11 +27,9 @@
 
 static AOM_INLINE void dealloc_context_buffers_ext(
     MBMIExtFrameBufferInfo *mbmi_ext_info) {
-  if (mbmi_ext_info->frame_base) {
-    aom_free(mbmi_ext_info->frame_base);
-    mbmi_ext_info->frame_base = NULL;
-    mbmi_ext_info->alloc_size = 0;
-  }
+  aom_free(mbmi_ext_info->frame_base);
+  mbmi_ext_info->frame_base = NULL;
+  mbmi_ext_info->alloc_size = 0;
 }
 
 static AOM_INLINE void alloc_context_buffers_ext(
@@ -64,14 +65,14 @@
 
   if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
 
-  if (cpi->td.mb.mv_costs) {
-    aom_free(cpi->td.mb.mv_costs);
-    cpi->td.mb.mv_costs = NULL;
-  }
-  // Avoid the memory allocation of 'mv_costs' for allintra encoding mode.
+  aom_free(cpi->td.mv_costs_alloc);
+  cpi->td.mv_costs_alloc = NULL;
+  // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding
+  // mode.
   if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
-    CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs,
-                    (MvCosts *)aom_calloc(1, sizeof(MvCosts)));
+    CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc,
+                    (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc)));
+    cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc;
   }
 
   av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
@@ -79,6 +80,9 @@
   av1_setup_sms_tree(cpi, &cpi->td);
   cpi->td.firstpass_ctx =
       av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+  if (!cpi->td.firstpass_ctx)
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PICK_MODE_CONTEXT");
 }
 
 // Allocate mbmi buffers which are used to store mode information at block
@@ -178,7 +182,7 @@
 static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   TokenInfo *token_info = &cpi->token_info;
-
+  const int num_planes = av1_num_planes(cm);
   dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
 
   aom_free(cpi->tile_data);
@@ -220,15 +224,25 @@
 
   release_obmc_buffers(&cpi->td.mb.obmc_buffer);
 
-  if (cpi->td.mb.mv_costs) {
-    aom_free(cpi->td.mb.mv_costs);
-    cpi->td.mb.mv_costs = NULL;
-  }
+  aom_free(cpi->td.mv_costs_alloc);
+  cpi->td.mv_costs_alloc = NULL;
+  aom_free(cpi->td.dv_costs_alloc);
+  cpi->td.dv_costs_alloc = NULL;
 
-  if (cpi->td.mb.dv_costs) {
-    aom_free(cpi->td.mb.dv_costs);
-    cpi->td.mb.dv_costs = NULL;
-  }
+  aom_free(cpi->td.mb.sb_stats_cache);
+  cpi->td.mb.sb_stats_cache = NULL;
+
+  aom_free(cpi->td.mb.sb_fp_stats);
+  cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+  aom_free(cpi->td.mb.rdcost);
+  cpi->td.mb.rdcost = NULL;
+#endif
+
+  av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  cpi->td.pc_root = NULL;
 
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++) {
@@ -236,33 +250,55 @@
       cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
     }
 
+  av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
   aom_free(cm->tpl_mvs);
   cm->tpl_mvs = NULL;
 
-  if (cpi->td.pixel_gradient_info) {
-    aom_free(cpi->td.pixel_gradient_info);
-    cpi->td.pixel_gradient_info = NULL;
-  }
+  aom_free(cpi->td.pixel_gradient_info);
+  cpi->td.pixel_gradient_info = NULL;
 
-  if (cpi->td.src_var_info_of_4x4_sub_blocks) {
-    aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
-    cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
-  }
+  aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+  cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
 
-  if (cpi->td.vt64x64) {
-    aom_free(cpi->td.vt64x64);
-    cpi->td.vt64x64 = NULL;
-  }
+  aom_free(cpi->td.vt64x64);
+  cpi->td.vt64x64 = NULL;
 
-  av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+  av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
   cpi->td.firstpass_ctx = NULL;
 
+  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+  // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+  // in av1_temporal_filter() for single-threaded encode are freed in case an
+  // error is encountered during temporal filtering (on early termination,
+  // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+  tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+  // This call ensures that tpl_tmp_buffers for single-threaded encode are freed
+  // in case of an error during tpl.
+  tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+  // This call ensures that the global motion (gm) data buffers for
+  // single-threaded encode are freed in case of an error during gm.
+  gm_dealloc_data(&cpi->td.gm_data);
+
+  // This call ensures that CDEF search context buffers are deallocated in case
+  // of an error during cdef search.
+  av1_cdef_dealloc_data(cpi->cdef_search_ctx);
+  aom_free(cpi->cdef_search_ctx);
+  cpi->cdef_search_ctx = NULL;
+
+  av1_dealloc_mb_data(&cpi->td.mb, num_planes);
+
+  av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+
   av1_free_txb_buf(cpi);
   av1_free_context_buffers(cm);
 
   aom_free_frame_buffer(&cpi->last_frame_uf);
 #if !CONFIG_REALTIME_ONLY
   av1_free_restoration_buffers(cm);
+  av1_free_firstpass_data(&cpi->firstpass_data);
 #endif
 
   if (!is_stat_generation_stage(cpi)) {
@@ -270,6 +306,13 @@
                           &cpi->mt_info.cdef_sync);
   }
 
+  for (int plane = 0; plane < num_planes; plane++) {
+    aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+    cpi->pick_lr_ctxt.rusi[plane] = NULL;
+  }
+  aom_free(cpi->pick_lr_ctxt.dgd_avg);
+  cpi->pick_lr_ctxt.dgd_avg = NULL;
+
   aom_free_frame_buffer(&cpi->trial_frame_rst);
   aom_free_frame_buffer(&cpi->scaled_source);
   aom_free_frame_buffer(&cpi->scaled_last_source);
@@ -304,16 +347,12 @@
   aom_free(cpi->svc.layer_context);
   cpi->svc.layer_context = NULL;
 
-  if (cpi->consec_zero_mv) {
-    aom_free(cpi->consec_zero_mv);
-    cpi->consec_zero_mv = NULL;
-    cpi->consec_zero_mv_alloc_size = 0;
-  }
+  aom_free(cpi->consec_zero_mv);
+  cpi->consec_zero_mv = NULL;
+  cpi->consec_zero_mv_alloc_size = 0;
 
-  if (cpi->src_sad_blk_64x64) {
-    aom_free(cpi->src_sad_blk_64x64);
-    cpi->src_sad_blk_64x64 = NULL;
-  }
+  aom_free(cpi->src_sad_blk_64x64);
+  cpi->src_sad_blk_64x64 = NULL;
 
   aom_free(cpi->mb_weber_stats);
   cpi->mb_weber_stats = NULL;
@@ -399,15 +438,23 @@
                        "Failed to reallocate scaled source buffer");
   assert(cpi->scaled_source.y_crop_width == scaled_width);
   assert(cpi->scaled_source.y_crop_height == scaled_height);
-  av1_resize_and_extend_frame_nonnormative(
-      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth,
-      num_planes);
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->unscaled_source, &cpi->scaled_source,
+          (int)cm->seq_params->bit_depth, num_planes))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to reallocate buffers during resize");
   return &cpi->scaled_source;
 }
 
 // Deallocate allocated thread_data.
 static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
   PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  const int num_tf_workers =
+      AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers);
+  const int num_tpl_workers =
+      AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers);
+  const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
+  const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE;
   for (int t = 1; t < p_mt_info->num_workers; ++t) {
     EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
     thread_data->td = thread_data->original_td;
@@ -429,12 +476,42 @@
         thread_data->td->hash_value_buffer[x][y] = NULL;
       }
     }
+    aom_free(thread_data->td->mv_costs_alloc);
+    thread_data->td->mv_costs_alloc = NULL;
+    aom_free(thread_data->td->dv_costs_alloc);
+    thread_data->td->dv_costs_alloc = NULL;
     aom_free(thread_data->td->counts);
-    av1_free_pmc(thread_data->td->firstpass_ctx,
-                 ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE);
+    av1_free_pmc(thread_data->td->firstpass_ctx, num_planes);
     thread_data->td->firstpass_ctx = NULL;
     av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
     av1_free_sms_tree(thread_data->td);
+    // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+    // in prepare_tf_workers() for MT encode are freed in case an error is
+    // encountered during temporal filtering (due to early termination,
+    // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be
+    // invoked).
+    if (t < num_tf_workers)
+      tf_dealloc_data(&thread_data->td->tf_data, is_highbitdepth);
+    // This call ensures that tpl_tmp_buffers for MT encode are freed in case of
+    // an error during tpl.
+    if (t < num_tpl_workers)
+      tpl_dealloc_temp_buffers(&thread_data->td->tpl_tmp_buffers);
+    // This call ensures that the buffers in gm_data for MT encode are freed in
+    // case of an error during gm.
+    gm_dealloc_data(&thread_data->td->gm_data);
+    av1_dealloc_mb_data(&thread_data->td->mb, num_planes);
+    aom_free(thread_data->td->mb.sb_stats_cache);
+    thread_data->td->mb.sb_stats_cache = NULL;
+    aom_free(thread_data->td->mb.sb_fp_stats);
+    thread_data->td->mb.sb_fp_stats = NULL;
+#if CONFIG_PARTITION_SEARCH_ORDER
+    aom_free(thread_data->td->mb.rdcost);
+    thread_data->td->mb.rdcost = NULL;
+#endif
+    av1_free_pc_tree_recursive(thread_data->td->pc_root, num_planes, 0, 0,
+                               SEARCH_PARTITION);
+    thread_data->td->pc_root = NULL;
+    av1_dealloc_mb_wiener_var_pred_buf(thread_data->td);
     aom_free(thread_data->td);
   }
 }
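
The new error-path cleanup above leans on each module deallocator being
idempotent, so dealloc_compressor_data() and free_thread_data() can call them
unconditionally even when the normal teardown already ran. A minimal sketch of
that free-and-NULL idiom (hypothetical struct, not a libaom type):

    #include <stdlib.h>

    /* Stand-in for a per-module buffer set such as the tpl or temporal-filter
     * temporary data. */
    typedef struct {
      int *work_buf;
      int *pred_buf;
    } my_module_data;

    /* Safe to call whether or not the buffers were ever allocated, and safe
     * to call twice: free(NULL) is a no-op and every pointer is reset. */
    void my_module_dealloc(my_module_data *data) {
      free(data->work_buf);
      data->work_buf = NULL;
      free(data->pred_buf);
      data->pred_buf = NULL;
    }
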
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index bc136b1..c35873d 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -521,7 +521,12 @@
               cpi->ppi->p_rc.gfu_boost, gfu_boost,
               cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
         } else {
-          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
+          // TPL may only look at a subset of frames in the gf group when the
+          // speed feature 'reduce_num_frames' is on, which affects the r0
+          // calculation. Thus, to compensate for TPL not using all frames, a
+          // factor to adjust r0 is used.
+          const int gfu_boost =
+              (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
           cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
               MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
               cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
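
To make the adjusted boost above concrete with illustrative numbers only: with
r0 = 0.04 the unadjusted boost would be 200.0 / 0.04 = 5000; if TPL visited
only part of the gf group and r0_adjust_factor were, say, 0.8, the boost
becomes (int)(200.0 * 0.8 / 0.04) = 4000 before combine_prior_with_tpl_boost()
blends it with the prior gfu_boost. With an adjust factor of 1.0 the expression
reduces to the previous 200.0 / r0.
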
@@ -691,6 +696,25 @@
         continue;
       }
 
+      // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the
+      // motion search can be skipped for the references: last, golden, altref.
+      // If so, we can skip scaling that reference.
+      if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref &&
+          cpi->ppi->rtc_ref.set_ref_frame_config) {
+        if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue;
+        if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue;
+        if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref)
+          continue;
+      }
+      // For RTC with superres on: golden reference only needs to be scaled
+      // if it was refreshed in previous frame.
+      if (is_one_pass_rt_params(cpi) &&
+          cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME &&
+          cpi->rc.frame_num_last_gf_refresh <
+              (int)cm->current_frame.frame_number - 1) {
+        continue;
+      }
+
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         // Replace the reference buffer with a copy having a thicker border,
         // if the reference buffer is higher resolution than the current
@@ -745,19 +769,25 @@
           }
 #if CONFIG_AV1_HIGHBITDEPTH
           if (use_optimized_scaler && has_optimized_scaler &&
-              cm->seq_params->bit_depth == AOM_BITS_8)
+              cm->seq_params->bit_depth == AOM_BITS_8) {
             av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
                                         num_planes);
-          else
-            av1_resize_and_extend_frame_nonnormative(
-                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+          } else if (!av1_resize_and_extend_frame_nonnormative(
+                         ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+                         num_planes)) {
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate buffer during resize");
+          }
 #else
-          if (use_optimized_scaler && has_optimized_scaler)
+          if (use_optimized_scaler && has_optimized_scaler) {
             av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
                                         num_planes);
-          else
-            av1_resize_and_extend_frame_nonnormative(
-                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+          } else if (!av1_resize_and_extend_frame_nonnormative(
+                         ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+                         num_planes)) {
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate buffer during resize");
+          }
 #endif
           cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 92e69da..10ff92d 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -510,8 +510,8 @@
 #define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT)                 \
   HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT,                      \
               aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
-              aom_highbd_obmc_variance##WIDTH##x##HEIGHT,    \
-              aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+              aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT,  \
+              aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
 
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
   ppi->fn_ptr[BT].osdf = OSDF;           \
@@ -1013,10 +1013,23 @@
 }
 
 static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
-  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  // Scaled references should only need to be released under certain
+  // conditions: if the reference will be updated, or if the scaled reference
+  // has the same resolution. For now, only apply this to Golden for non-svc
+  // RTC mode.
+  AV1_COMMON *const cm = &cpi->common;
+  const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 1 : 0;
+  bool release_golden = true;
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
-    if (buf != NULL) {
+    const int golden_ref = (i == GOLDEN_FRAME - 1);
+    if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc &&
+        buf != NULL) {
+      const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME);
+      const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width &&
+                               buf->buf.y_crop_height == ref->buf.y_crop_height;
+      release_golden = refresh_golden || same_resoln;
+    }
+    if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) {
       --buf->ref_count;
       cpi->scaled_ref_buf[i] = NULL;
     }
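
The golden-frame special case above keeps the scaled buffer only when it can
still be reused: it is released if golden will be refreshed this frame or if
the scaled copy already matches the reference resolution. A hedged distillation
of that predicate as a standalone helper (hypothetical name, not a libaom API):

    #include <stdbool.h>

    bool should_release_scaled_golden(bool refresh_golden, int scaled_w,
                                      int scaled_h, int ref_w, int ref_h) {
      const bool same_resolution = scaled_w == ref_w && scaled_h == ref_h;
      // Keep the scaled golden reference only when it will not be refreshed
      // and its resolution differs from the unscaled golden reference.
      return refresh_golden || same_resolution;
    }
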
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 430c6ae..24d47f3 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -9,15 +9,17 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
+
 #include "av1/common/warped_motion.h"
 #include "av1/common/thread_common.h"
 
 #include "av1/encoder/allintra_vis.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encoder_alloc.h"
-#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/ethread.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/firstpass.h"
@@ -149,7 +151,13 @@
   if (sig) {
     pthread_mutex_lock(&row_mt_sync->mutex_[r]);
 
-    row_mt_sync->num_finished_cols[r] = cur;
+    // When a thread encounters an error, num_finished_cols[r] is set to the
+    // maximum column number. In this case, the AOMMAX operation here ensures
+    // that num_finished_cols[r] is not overwritten with a smaller value, thus
+    // preventing threads from waiting indefinitely in the relevant
+    // sync_read() function.
+    row_mt_sync->num_finished_cols[r] =
+        AOMMAX(row_mt_sync->num_finished_cols[r], cur);
 
     pthread_cond_signal(&row_mt_sync->cond_[r]);
     pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
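
The AOMMAX above keeps the per-row progress counter monotonically
non-decreasing, so an error path that force-advances a row to "all columns
done" cannot later be rolled back by a slower worker publishing an older
value. A standalone sketch of the idea (hypothetical names, pthread-based like
the row-mt code):

    #include <pthread.h>

    #define MY_MAX(a, b) ((a) > (b) ? (a) : (b))

    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int finished_cols;  // progress of one row
    } my_row_sync;

    // Writer side: never lower the published progress value.
    void my_sync_write(my_row_sync *s, int cur) {
      pthread_mutex_lock(&s->mutex);
      s->finished_cols = MY_MAX(s->finished_cols, cur);
      pthread_cond_signal(&s->cond);
      pthread_mutex_unlock(&s->mutex);
    }

    // Reader side: waits until at least `need` columns are done; an error
    // path that publishes the maximum column count via my_sync_write()
    // unblocks it immediately, which is what set_firstpass_encode_done() and
    // set_encoding_done() do further down in this patch.
    void my_sync_wait(my_row_sync *s, int need) {
      pthread_mutex_lock(&s->mutex);
      while (s->finished_cols < need) pthread_cond_wait(&s->cond, &s->mutex);
      pthread_mutex_unlock(&s->mutex);
    }
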
@@ -194,7 +202,7 @@
 }
 
 // Deallocate row based multi-threading synchronization related mutex and data
-static void row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
   if (row_mt_sync != NULL) {
 #if CONFIG_MULTITHREAD
     int i;
@@ -264,6 +272,8 @@
   enc_row_mt->allocated_rows = max_rows;
   enc_row_mt->allocated_cols = max_cols - 1;
   enc_row_mt->allocated_sb_rows = sb_rows;
+  enc_row_mt->row_mt_exit = false;
+  enc_row_mt->firstpass_mt_exit = false;
 }
 
 void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
@@ -278,7 +288,7 @@
       int tile_index = tile_row * tile_cols + tile_col;
       TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
 
-      row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+      av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
 
       if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
     }
@@ -391,18 +401,68 @@
 }
 
 #if !CONFIG_REALTIME_ONLY
+static void set_firstpass_encode_done(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  const int unit_height = mi_size_high[fp_block_size];
+
+  // In case of multithreading of firstpass encode, due to top-right
+  // dependency, the worker on a firstpass row waits for the completion of the
+  // firstpass processing of the top and top-right fp_blocks. Hence, in case a
+  // thread (main/worker) encounters an error, update the firstpass processing
+  // of every row in the frame to indicate that it is complete in order to avoid
+  // dependent workers waiting indefinitely.
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileInfo *tile = &tile_data->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+      const int unit_cols_in_tile =
+          av1_get_unit_cols_in_tile(tile, fp_block_size);
+      for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
+           mi_row < tile->mi_row_end;
+           mi_row += unit_height, unit_row_in_tile++) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+                                   unit_cols_in_tile - 1, unit_cols_in_tile);
+      }
+    }
+  }
+}
+
 static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
-  AV1_COMMON *const cm = &cpi->common;
   int thread_id = thread_data->thread_id;
   AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
-  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
 #endif
   (void)unused;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  xd->error_info = error_info;
 
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+    enc_row_mt->firstpass_mt_exit = true;
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    set_firstpass_encode_done(cpi);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  AV1_COMMON *const cm = &cpi->common;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
   assert(cur_tile_id != -1);
 
   const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
@@ -413,8 +473,9 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
-                      unit_height)) {
+    bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+    if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+                                            &current_mi_row, unit_height)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
       switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
@@ -424,7 +485,9 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
-    if (end_of_frame == 1) break;
+    // When firstpass_mt_exit is set to true, other workers need not pursue any
+    // further jobs.
+    if (firstpass_mt_exit || end_of_frame) break;
 
     TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
     AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
@@ -444,7 +507,7 @@
     pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
   }
-
+  error_info->setjmp = 0;
   return 1;
 }
 #endif
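
Every worker hook touched by this patch follows the same setjmp()-based error
handling: install a jump buffer in the worker's own error_info, let
aom_internal_error() longjmp() back on failure, publish a module exit flag
under the mutex, unblock any waiters, and return 0 so the sync step can
re-raise the error on the main thread. A minimal standalone sketch of the
control flow (hypothetical types; the real hooks also clear error_info->setjmp
before every return because the jmp_buf is only valid while the hook is on the
stack):

    #include <pthread.h>
    #include <setjmp.h>

    typedef struct {
      jmp_buf jmp;
      int setjmp_valid;
    } my_error_info;

    typedef struct {
      pthread_mutex_t mutex;
      int mt_exit;  // tells the other workers to stop picking up jobs
    } my_mt_state;

    // Stand-in for aom_internal_error(): transfers control back to the
    // worker's setjmp() when the jump buffer is live.
    void my_raise(my_error_info *e) {
      if (e->setjmp_valid) longjmp(e->jmp, 1);
    }

    int my_worker_hook(my_error_info *err, my_mt_state *mt) {
      if (setjmp(err->jmp)) {
        // Error path: invalidate the jump buffer, publish the exit flag.
        err->setjmp_valid = 0;
        pthread_mutex_lock(&mt->mutex);
        mt->mt_exit = 1;
        pthread_mutex_unlock(&mt->mutex);
        return 0;  // failure; the sync step re-raises on the main thread
      }
      err->setjmp_valid = 1;

      // Per-row work goes here; any failure calls my_raise(err).

      err->setjmp_valid = 0;  // the jmp_buf dies with this stack frame
      return 1;  // success
    }
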
@@ -455,6 +518,7 @@
   AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
   const int sb_rows = get_sb_rows_in_frame(cm);
   AV1LfMTInfo *cur_job_info;
+  bool row_mt_exit = false;
   (void)enc_row_mt;
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
@@ -469,43 +533,116 @@
     const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
     // Wait for current and next superblock row to finish encoding.
     pthread_mutex_lock(enc_row_mt_mutex_);
-    while (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
-           enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols) {
+    while (!enc_row_mt->row_mt_exit &&
+           (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+            enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
       pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
     }
+    row_mt_exit = enc_row_mt->row_mt_exit;
     pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
+    if (row_mt_exit) return;
+
     av1_thread_loop_filter_rows(
         lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
         cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
-        lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf,
-        mib_size_log2);
+        lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+        lf_data->tx_buf, mib_size_log2);
+  }
+}
+
+static void set_encoding_done(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int mib_size = cm->seq_params->mib_size;
+
+  // In case of row-multithreading, due to top-right dependency, the worker on
+  // an SB row waits for the completion of the encode of the top and top-right
+  // SBs. Hence, in case a thread (main/worker) encounters an error, mark the
+  // encoding of every SB row in the frame as complete in order to prevent
+  // the dependent workers of every tile from waiting indefinitely.
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *const this_tile =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      const TileInfo *const tile_info = &this_tile->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+      const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+      for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
+           mi_row < tile_info->mi_row_end;
+           mi_row += mib_size, sb_row_in_tile++) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+                                   sb_cols_in_tile - 1, sb_cols_in_tile);
+      }
+    }
   }
 }
 
 static int enc_row_mt_worker_hook(void *arg1, void *unused) {
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
-  AV1_COMMON *const cm = &cpi->common;
   int thread_id = thread_data->thread_id;
   AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
-  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
-  const int mib_size_log2 = cm->seq_params->mib_size_log2;
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
 #endif
   (void)unused;
+
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  AV1LfSync *const lf_sync = thread_data->lf_sync;
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+    enc_row_mt->row_mt_exit = true;
+    // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+    // case of an error.
+    pthread_cond_broadcast(enc_row_mt->cond_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    set_encoding_done(cpi);
+
+    if (cpi->mt_info.pipeline_lpf_mt_with_enc) {
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(lf_sync->job_mutex);
+      lf_sync->lf_mt_exit = true;
+      pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+      av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+                                    cpi->common.seq_params->mib_size_log2);
+    }
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
   // Preallocate the pc_tree for realtime coding to reduce the cost of memory
   // allocation.
-  thread_data->td->rt_pc_root =
-      cpi->sf.rt_sf.use_nonrd_pick_mode
-          ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
-          : NULL;
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+    if (!thread_data->td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+  } else {
+    thread_data->td->pc_root = NULL;
+  }
 
   assert(cur_tile_id != -1);
 
   const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
   int end_of_frame = 0;
+  bool row_mt_exit = false;
 
   // When master thread does not have a valid job to process, xd->tile_ctx
   // is not set and it contains NULL pointer. This can result in NULL pointer
@@ -518,7 +655,12 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+    row_mt_exit = enc_row_mt->row_mt_exit;
+    // The row_mt_exit check here could be avoided since it is also checked
+    // after sync_read_ptr() in encode_sb_row(). However, checking it here
+    // lets the worker return early, before calling get_next_job().
+    if (!row_mt_exit &&
+        !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
                       cm->seq_params->mib_size)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
@@ -529,7 +671,14 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
-    if (end_of_frame == 1) break;
+    // When row_mt_exit is set to true, other workers need not pursue any
+    // further jobs.
+    if (row_mt_exit) {
+      error_info->setjmp = 0;
+      return 1;
+    }
+
+    if (end_of_frame) break;
 
     TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
     AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
@@ -583,26 +732,46 @@
     // encoding and loop filter stage.
     launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
   }
-  av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
-                             0, cpi->sf.part_sf.partition_search_type);
+  av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  thread_data->td->pc_root = NULL;
+  error_info->setjmp = 0;
   return 1;
 }
 
 static int enc_worker_hook(void *arg1, void *unused) {
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
   const AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
   int t;
 
   (void)unused;
+
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   // Preallocate the pc_tree for realtime coding to reduce the cost of memory
   // allocation.
-  thread_data->td->rt_pc_root =
-      cpi->sf.rt_sf.use_nonrd_pick_mode
-          ? av1_alloc_pc_tree_node(cm->seq_params->sb_size)
-          : NULL;
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+    if (!thread_data->td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+  } else {
+    thread_data->td->pc_root = NULL;
+  }
 
   for (t = thread_data->start; t < tile_rows * tile_cols;
        t += cpi->mt_info.num_workers) {
@@ -616,9 +785,10 @@
     av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
   }
 
-  av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
-                             0, cpi->sf.part_sf.partition_search_type);
-
+  av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  thread_data->td->pc_root = NULL;
+  error_info->setjmp = 0;
   return 1;
 }
 
@@ -651,10 +821,11 @@
   AV1_COMMON *const cm = &cpi->common;
   AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
   if (lr_sync->sync_range) {
-    int num_lr_workers =
-        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
     if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
       return;
+    int num_lr_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+    assert(num_lr_workers <= lr_sync->num_workers);
     lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
     lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
   }
@@ -720,16 +891,21 @@
       av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
     }
 
+    // Initialize tpl MT object.
+    AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
+    if (tpl_row_mt->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
+                      aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
+      if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
+    }
+    tpl_row_mt->tpl_mt_exit = false;
+
 #if !CONFIG_REALTIME_ONLY
     if (is_restoration_used(cm)) {
       // Initialize loop restoration MT object.
       AV1LrSync *lr_sync = &mt_info->lr_row_sync;
-      int rst_unit_size;
-      if (cm->width * cm->height > 352 * 288)
-        rst_unit_size = RESTORATION_UNITSIZE_MAX;
-      else
-        rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
-      int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
+      int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+      int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
       int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
       if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
           num_lr_workers > lr_sync->num_workers ||
@@ -754,7 +930,7 @@
 
 // Computes the number of workers to be considered while allocating memory for a
 // multi-threaded module under FPMT.
-int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
                                       MULTI_THREADED_MODULES mod_name) {
   int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
   if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
@@ -798,6 +974,9 @@
         // Set up firstpass PICK_MODE_CONTEXT.
         thread_data->td->firstpass_ctx = av1_alloc_pmc(
             ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+        if (!thread_data->td->firstpass_ctx)
+          aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       }
 
       if (!is_first_pass && i < num_enc_workers) {
@@ -1111,6 +1290,7 @@
     if (is_restoration_used(cm)) {
       // Back up the original LR buffers before update.
       int idx = i + mt_info->num_workers - 1;
+      assert(idx < mt_info->lr_row_sync.num_workers);
       mt_info->restore_state_buf.rst_tmpbuf =
           mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
       mt_info->restore_state_buf.rlbs =
@@ -1153,27 +1333,6 @@
   }
 }
 
-// Synchronize level 1 workers.
-static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi) {
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int num_workers = ppi->p_mt_info.p_num_workers;
-  int had_error = 0;
-  // Points to error in the earliest display order frame in the parallel set.
-  const struct aom_internal_error_info *error;
-
-  // Encoding ends.
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
-    if (!winterface->sync(worker)) {
-      had_error = 1;
-      error = ((AV1_COMP *)worker->data1)->common.error;
-    }
-  }
-
-  if (had_error)
-    aom_internal_error(&ppi->error, error->error_code, "%s", error->detail);
-}
-
 // Restore worker states after parallel encode.
 static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
                                                   int parallel_frame_count) {
@@ -1203,6 +1362,7 @@
     if (is_restoration_used(cm)) {
       // Restore the original LR buffers.
       int idx = i + mt_info->num_workers - 1;
+      assert(idx < mt_info->lr_row_sync.num_workers);
       mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
           mt_info->restore_state_buf.rst_tmpbuf;
       mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
@@ -1215,6 +1375,30 @@
   }
 }
 
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi,
+                                         int frames_in_parallel_set) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_workers = ppi->p_mt_info.p_num_workers;
+  int had_error = 0;
+  // Points to error in the earliest display order frame in the parallel set.
+  const struct aom_internal_error_info *error;
+
+  // Encoding ends.
+  for (int i = num_workers - 1; i >= 0; --i) {
+    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error = ppi->parallel_cpi[i]->common.error;
+    }
+  }
+
+  restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+
+  if (had_error)
+    aom_internal_error(&ppi->error, error->error_code, "%s", error->detail);
+}
+
 static int get_compressed_data_hook(void *arg1, void *arg2) {
   AV1_COMP *cpi = (AV1_COMP *)arg1;
   AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
@@ -1236,8 +1420,7 @@
   prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
                        frames_in_parallel_set);
   launch_fpmt_workers(ppi);
-  sync_fpmt_workers(ppi);
-  restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+  sync_fpmt_workers(ppi, frames_in_parallel_set);
 
   // Release cpi->scaled_ref_buf corresponding to frames in the current parallel
   // encode set.
@@ -1254,6 +1437,7 @@
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &mt_info->workers[i];
+    worker->had_error = 0;
     if (i == 0)
       winterface->execute(worker);
     else
@@ -1264,17 +1448,33 @@
 static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
                                         AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int had_error = mt_info->workers[0].had_error;
+  const AVxWorker *const worker_main = &mt_info->workers[0];
+  int had_error = worker_main->had_error;
+  struct aom_internal_error_info error_info;
+
+  // Read the error_info of the main thread.
+  if (had_error) {
+    error_info = ((EncWorkerData *)worker_main->data1)->error_info;
+  }
 
   // Encoding ends.
   for (int i = num_workers - 1; i > 0; i--) {
     AVxWorker *const worker = &mt_info->workers[i];
-    had_error |= !winterface->sync(worker);
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error_info = ((EncWorkerData *)worker->data1)->error_info;
+    }
   }
 
   if (had_error)
-    aom_internal_error(cm->error, AOM_CODEC_ERROR,
-                       "Failed to encode tile data");
+    aom_internal_error(cm->error, error_info.error_code, "%s",
+                       error_info.detail);
+
+  // Restore xd->error_info of the main thread back to cm->error so that the
+  // multithreaded code, when executed using a single thread, has a valid
+  // xd->error_info.
+  MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd;
+  xd->error_info = cm->error;
 }
 
 static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
@@ -1292,13 +1492,15 @@
       // Keep these conditional expressions in sync with the corresponding ones
       // in prepare_enc_workers().
       if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
-        aom_free(thread_data->td->mb.mv_costs);
+        aom_free(thread_data->td->mv_costs_alloc);
+        thread_data->td->mv_costs_alloc = NULL;
       }
       if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
-        aom_free(thread_data->td->mb.dv_costs);
+        aom_free(thread_data->td->dv_costs_alloc);
+        thread_data->td->dv_costs_alloc = NULL;
       }
     }
-    av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb);
+    av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));
 
     // Accumulate counters.
     if (i > 0) {
@@ -1362,8 +1564,10 @@
       // Keep these conditional expressions in sync with the corresponding ones
       // in accumulate_counters_enc_workers().
       if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
-        CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
-                        (MvCosts *)aom_malloc(sizeof(MvCosts)));
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->mv_costs_alloc,
+            (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc)));
+        thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc;
         memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
                sizeof(MvCosts));
       }
@@ -1373,8 +1577,10 @@
         // aom_free() call for the same.
         thread_data->td->mb.dv_costs = NULL;
         if (av1_need_dv_costs(cpi)) {
-          CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs,
-                          (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts)));
+          CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc,
+                          (IntraBCMVCosts *)aom_malloc(
+                              sizeof(*thread_data->td->dv_costs_alloc)));
+          thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc;
           memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
                  sizeof(IntraBCMVCosts));
         }
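
The mv_costs_alloc / dv_costs_alloc change above separates the owning pointer
(on ThreadData) from the borrowed pointer the search code actually uses
(on MACROBLOCK), so mb.mv_costs can be repointed (e.g. when mb is copied from
cpi->td.mb) without losing track of the thread's own allocation or freeing
through an alias. A sketch of the idiom with hypothetical struct names:

    #include <stdlib.h>
    #include <string.h>

    typedef struct { int joint_cost[4]; } my_mv_costs;

    typedef struct {
      my_mv_costs *mv_costs;  // borrowed; may be repointed at shared tables
    } my_macroblock;

    typedef struct {
      my_mv_costs *mv_costs_alloc;  // owning pointer; the only one freed
      my_macroblock mb;
    } my_thread_data;

    int my_thread_data_init(my_thread_data *td, const my_mv_costs *src) {
      td->mv_costs_alloc = (my_mv_costs *)malloc(sizeof(*td->mv_costs_alloc));
      if (!td->mv_costs_alloc) return 0;
      memcpy(td->mv_costs_alloc, src, sizeof(*src));
      td->mb.mv_costs = td->mv_costs_alloc;  // borrow the owned copy
      return 1;
    }

    void my_thread_data_free(my_thread_data *td) {
      free(td->mv_costs_alloc);  // free via the owner, never the alias
      td->mv_costs_alloc = NULL;
      td->mb.mv_costs = NULL;
    }
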
@@ -1438,26 +1644,17 @@
       thread_data->td = thread_data->original_td;
     }
 
-    // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
+      // Before encoding a frame, copy the thread data from cpi.
       thread_data->td->mb = cpi->td.mb;
-      // Keep this conditional expression in sync with the corresponding one
-      // in av1_fp_encode_tiles_row_mt().
-      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
-        CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
-                        (MvCosts *)aom_malloc(sizeof(MvCosts)));
-        memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
-               sizeof(MvCosts));
-      }
+      av1_alloc_src_diff_buf(cm, &thread_data->td->mb);
     }
-
-    av1_alloc_mb_data(cpi, &thread_data->td->mb);
   }
 }
 #endif
 
 // Computes the number of workers for row multi-threading of encoding stage
-static AOM_INLINE int compute_num_enc_row_mt_workers(AV1_COMMON *const cm,
+static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
                                                      int max_threads) {
   TileInfo tile_info;
   const int tile_cols = cm->tiles.cols;
@@ -1476,7 +1673,7 @@
 }
 
 // Computes the number of workers for tile multi-threading of encoding stage
-static AOM_INLINE int compute_num_enc_tile_mt_workers(AV1_COMMON *const cm,
+static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
                                                       int max_threads) {
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
@@ -1494,7 +1691,7 @@
 }
 
 // Computes the number of workers for encoding stage (row/tile multi-threading)
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {
   if (max_workers <= 1) return 1;
   if (cpi->oxcf.row_mt)
     return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
@@ -1752,6 +1949,15 @@
 }
 
 #if !CONFIG_REALTIME_ONLY
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+  for (int i = num_workers - 1; i >= 0; --i) {
+    EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+    if (thread_data->td != &cpi->td)
+      av1_dealloc_src_diff_buf(&thread_data->td->mb,
+                               av1_num_planes(&cpi->common));
+  }
+}
+
 void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MultiThreadInfo *const mt_info = &cpi->mt_info;
@@ -1814,18 +2020,7 @@
   fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
-  for (int i = num_workers - 1; i >= 0; i--) {
-    EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
-    if (thread_data->td != &cpi->td) {
-      // Keep this conditional expression in sync with the corresponding one
-      // in fp_prepare_enc_workers().
-      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
-        aom_free(thread_data->td->mb.mv_costs);
-      }
-      assert(!thread_data->td->mb.dv_costs);
-    }
-    av1_dealloc_mb_data(cm, &thread_data->td->mb);
-  }
+  dealloc_thread_data_src_diff_buf(cpi, num_workers);
 }
 
 void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
@@ -1894,6 +2089,27 @@
 #endif  // CONFIG_MULTITHREAD
 }
 
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  const BLOCK_SIZE bsize =
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+  const int mi_height = mi_size_high[bsize];
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+  const int tplb_cols_in_tile =
+      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+  // In case of tpl row-multithreading, due to top-right dependency, the worker
+  // on an mb_row waits for the completion of the tpl processing of the top and
+  // top-right blocks. Hence, in case a thread (main/worker) encounters an
+  // error, mark the tpl processing of every mb_row in the frame as complete
+  // in order to avoid dependent workers waiting indefinitely.
+  for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
+       mi_row += mi_height, tplb_row++) {
+    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  tplb_cols_in_tile - 1, tplb_cols_in_tile);
+  }
+}
+
 // Each worker calls tpl_worker_hook() and computes the tpl data.
 static int tpl_worker_hook(void *arg1, void *unused) {
   (void)unused;
@@ -1903,11 +2119,36 @@
   MACROBLOCK *x = &thread_data->td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
   TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+  TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
   CommonModeInfoParams *mi_params = &cm->mi_params;
+  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  xd->error_info = error_info;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+  (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(tpl_error_mutex_);
+    tpl_row_mt->tpl_mt_exit = true;
+    pthread_mutex_unlock(tpl_error_mutex_);
+#endif
+    set_mode_estimation_done(cpi);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
   TX_SIZE tx_size = max_txsize_lookup[bsize];
   int mi_height = mi_size_high[bsize];
-  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
 
   av1_init_tpl_txfm_stats(tpl_txfm_stats);
 
@@ -1919,8 +2160,10 @@
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
-    av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, x, mi_row, bsize, tx_size);
+    av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
+                              bsize, tx_size);
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
@@ -2005,6 +2248,11 @@
       // OBMC buffers are used only to init MS params and remain unused when
       // called from tpl, hence set the buffers to defaults.
       av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+      if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
+                                  cpi->ppi->tpl_data.tpl_bsize_1d)) {
+        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Error allocating tpl data");
+      }
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
     }
@@ -2056,6 +2304,11 @@
 #if CONFIG_BITRATE_ACCURACY
   tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
 #endif  // CONFIG_BITRATE_ACCURACY
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+    ThreadData *td = thread_data->td;
+    if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+  }
 }
 
 // Deallocate memory for temporal filter multi-thread synchronization.
@@ -2079,7 +2332,7 @@
   pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
   pthread_mutex_lock(tf_mutex_);
 #endif
-  if (tf_mt_sync->next_tf_row < mb_rows) {
+  if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
     *current_mb_row = tf_mt_sync->next_tf_row;
     tf_mt_sync->next_tf_row++;
     do_next_row = 1;
@@ -2099,6 +2352,28 @@
   TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
   AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
   const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(tf_mutex_);
+    tf_sync->tf_mt_exit = true;
+    pthread_mutex_unlock(tf_mutex_);
+#endif
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   const int num_planes = av1_num_planes(&cpi->common);
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
@@ -2115,6 +2390,7 @@
 
   tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
 
+  error_info->setjmp = 0;
   return 1;
 }
 
@@ -2123,6 +2399,7 @@
                                int num_workers, int is_highbitdepth) {
   MultiThreadInfo *mt_info = &cpi->mt_info;
   mt_info->tf_sync.next_tf_row = 0;
+  mt_info->tf_sync.tf_mt_exit = false;
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *worker = &mt_info->workers[i];
     EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
@@ -2227,19 +2504,6 @@
   get_next_gm_job(cpi, frame_idx, *(cur_dir));
 }
 
-// Initializes inliers, num_inliers and segment_map.
-static AOM_INLINE void init_gm_thread_data(
-    const GlobalMotionInfo *gm_info, GlobalMotionThreadData *thread_data) {
-  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-    MotionModel motion_params = thread_data->motion_models[m];
-    av1_zero(motion_params.params);
-    motion_params.num_inliers = 0;
-  }
-
-  av1_zero_array(thread_data->segment_map,
-                 gm_info->segment_map_w * gm_info->segment_map_h);
-}
-
 // Hook function for each thread in global motion multi-threading.
 static int gm_mt_worker_hook(void *arg1, void *unused) {
   (void)unused;
@@ -2247,16 +2511,34 @@
   EncWorkerData *thread_data = (EncWorkerData *)arg1;
   AV1_COMP *cpi = thread_data->cpi;
   GlobalMotionInfo *gm_info = &cpi->gm_info;
-  MultiThreadInfo *mt_info = &cpi->mt_info;
-  JobInfo *job_info = &mt_info->gm_sync.job_info;
+  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+  JobInfo *job_info = &gm_sync->job_info;
   int thread_id = thread_data->thread_id;
-  GlobalMotionThreadData *gm_thread_data =
-      &mt_info->gm_sync.thread_data[thread_id];
-  int cur_dir = job_info->thread_id_to_dir[thread_id];
+  GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
 #if CONFIG_MULTITHREAD
-  pthread_mutex_t *gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
 #endif
 
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(gm_mt_mutex_);
+    gm_sync->gm_mt_exit = true;
+    pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  int cur_dir = job_info->thread_id_to_dir[thread_id];
+  bool gm_mt_exit = false;
   while (1) {
     int ref_buf_idx = -1;
 
@@ -2264,9 +2546,10 @@
     pthread_mutex_lock(gm_mt_mutex_);
 #endif
 
+    gm_mt_exit = gm_sync->gm_mt_exit;
     // Populates ref_buf_idx(the reference frame type) for which global motion
     // estimation will be done.
-    if (!get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+    if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
       // No jobs are available for the current direction. Switch
       // to other direction and get the next job, if available.
       switch_direction(cpi, &ref_buf_idx, &cur_dir);
@@ -2276,15 +2559,15 @@
     pthread_mutex_unlock(gm_mt_mutex_);
 #endif
 
-    if (ref_buf_idx == -1) break;
-
-    init_gm_thread_data(gm_info, gm_thread_data);
+    // When gm_mt_exit is set to true, other workers need not pursue any
+    // further jobs.
+    if (gm_mt_exit || ref_buf_idx == -1) break;
 
     // Compute global motion for the given ref_buf_idx.
     av1_compute_gm_for_valid_ref_frames(
-        cpi, gm_info->ref_buf, ref_buf_idx, gm_thread_data->motion_models,
-        gm_thread_data->segment_map, gm_info->segment_map_w,
-        gm_info->segment_map_h);
+        cpi, error_info, gm_info->ref_buf, ref_buf_idx,
+        gm_thread_data->motion_models, gm_thread_data->segment_map,
+        gm_info->segment_map_w, gm_info->segment_map_h);
 
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(gm_mt_mutex_);
@@ -2300,6 +2583,7 @@
     pthread_mutex_unlock(gm_mt_mutex_);
 #endif
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
@@ -2307,6 +2591,7 @@
 static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                           int num_workers) {
   MultiThreadInfo *mt_info = &cpi->mt_info;
+  mt_info->gm_sync.gm_mt_exit = false;
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *worker = &mt_info->workers[i];
     EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
@@ -2325,6 +2610,9 @@
     } else {
       thread_data->td = thread_data->original_td;
     }
+
+    if (thread_data->td != &cpi->td)
+      gm_alloc_data(cpi, &thread_data->td->gm_data);
   }
 }
 
@@ -2351,69 +2639,28 @@
 }
 
 // Frees the memory allocated for each worker in global motion multi-threading.
-void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data) {
-  if (gm_sync_data->thread_data != NULL) {
-    for (int j = 0; j < gm_sync_data->allocated_workers; j++) {
-      GlobalMotionThreadData *thread_data = &gm_sync_data->thread_data[j];
-      aom_free(thread_data->segment_map);
-
-      for (int m = 0; m < RANSAC_NUM_MOTIONS; m++)
-        aom_free(thread_data->motion_models[m].inliers);
-    }
-    aom_free(gm_sync_data->thread_data);
-  }
-}
-
-// Allocates memory for inliers and segment_map for each worker in global motion
-// multi-threading.
-static AOM_INLINE void gm_alloc(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *cm = &cpi->common;
-  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
-  GlobalMotionInfo *gm_info = &cpi->gm_info;
-
-  gm_sync->allocated_workers = num_workers;
-  gm_sync->allocated_width = cpi->source->y_width;
-  gm_sync->allocated_height = cpi->source->y_height;
-
-  CHECK_MEM_ERROR(cm, gm_sync->thread_data,
-                  aom_malloc(sizeof(*gm_sync->thread_data) * num_workers));
-
-  for (int i = 0; i < num_workers; i++) {
-    GlobalMotionThreadData *thread_data = &gm_sync->thread_data[i];
-    CHECK_MEM_ERROR(
-        cm, thread_data->segment_map,
-        aom_malloc(sizeof(*thread_data->segment_map) * gm_info->segment_map_w *
-                   gm_info->segment_map_h));
-
-    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-      CHECK_MEM_ERROR(
-          cm, thread_data->motion_models[m].inliers,
-          aom_malloc(sizeof(*thread_data->motion_models[m].inliers) * 2 *
-                     MAX_CORNERS));
-    }
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int j = 0; j < num_workers; j++) {
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+    ThreadData *td = thread_data->td;
+    if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
   }
 }
 
 // Implements multi-threading for global motion.
 void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
-  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
-  JobInfo *job_info = &gm_sync->job_info;
+  JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
 
   av1_zero(*job_info);
 
   int num_workers = compute_gm_workers(cpi);
 
-  if (num_workers > gm_sync->allocated_workers ||
-      cpi->source->y_width != gm_sync->allocated_width ||
-      cpi->source->y_height != gm_sync->allocated_height) {
-    av1_gm_dealloc(gm_sync);
-    gm_alloc(cpi, num_workers);
-  }
-
   assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
   prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+  gm_dealloc_thread_data(cpi, num_workers);
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
@@ -2455,10 +2702,33 @@
 
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
+      av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td);
     }
   }
 }
 
+static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  const BLOCK_SIZE bsize = cpi->weber_bsize;
+  const int mb_step = mi_size_wide[bsize];
+  assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL);
+  const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+  const int mt_unit_cols =
+      (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+  const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+  AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+      &cpi->ppi->intra_row_mt_sync;
+
+  // Update the wiener variance computation of every row in the frame to
+  // indicate that it is complete in order to avoid dependent workers waiting
+  // indefinitely.
+  for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows;
+       mi_row += mb_step, ++mt_thread_id) {
+    intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+                                   mt_unit_cols - 1, mt_unit_cols);
+  }
+}
+
 static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
   (void)unused;
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
@@ -2472,42 +2742,74 @@
   AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
   (void)enc_row_mt;
 #if CONFIG_MULTITHREAD
-  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+  pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
 #endif
+
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex);
+    enc_row_mt->mb_wiener_mt_exit = true;
+    pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+    set_mb_wiener_var_calc_done(cpi);
+    return 0;
+  }
+  error_info->setjmp = 1;
   DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
   DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
   DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
   DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
   double sum_rec_distortion = 0;
   double sum_est_rate = 0;
-  int has_jobs = 1;
-  while (has_jobs) {
+  while (1) {
     int current_mi_row = -1;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(enc_row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex);
 #endif
-    has_jobs =
-        get_next_job_allintra(intra_row_mt_sync, cpi->common.mi_params.mi_rows,
-                              &current_mi_row, mb_step);
+    int has_jobs = enc_row_mt->mb_wiener_mt_exit
+                       ? 0
+                       : get_next_job_allintra(intra_row_mt_sync,
+                                               cpi->common.mi_params.mi_rows,
+                                               &current_mi_row, mb_step);
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(enc_row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex);
 #endif
     if (!has_jobs) break;
     // TODO(chengchen): properly accumulate the distortion and rate.
     av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
                                qcoeff, dqcoeff, &sum_rec_distortion,
-                               &sum_est_rate);
+                               &sum_est_rate,
+                               thread_data->td->wiener_tmp_pred_buf);
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(enc_row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex);
 #endif
     intra_row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(enc_row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex);
 #endif
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+  av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int j = 0; j < num_workers; ++j) {
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+    ThreadData *td = thread_data->td;
+    if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
+  }
+}
+
 // This function is the multi-threading version of computing the wiener
 // variance.
 // Note that the wiener variance is used for allintra mode (1 pass) and its
@@ -2533,12 +2835,12 @@
   intra_row_mt_sync->next_mi_row = 0;
   memset(intra_row_mt_sync->num_finished_cols, -1,
          sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows);
+  mt_info->enc_row_mt.mb_wiener_mt_exit = false;
 
   prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
   launch_workers(mt_info, num_workers);
   sync_enc_workers(mt_info, cm, num_workers);
-
-  row_mt_sync_mem_dealloc(intra_row_mt_sync);
+  dealloc_mb_wiener_var_mt_data(cpi, num_workers);
 }
 
 // Compare and order tiles based on absolute sum of tx coeffs.
@@ -2713,14 +3015,40 @@
   const CommonTileParams *const tiles = &cm->tiles;
   const int num_tiles = tiles->cols * tiles->rows;
 
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pack_bs_mutex);
+    pack_bs_sync->pack_bs_mt_exit = true;
+    pthread_mutex_unlock(pack_bs_mutex);
+#endif
+    return 0;
+  }
+  error_info->setjmp = 1;
+
   while (1) {
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(pack_bs_sync->mutex_);
+    pthread_mutex_lock(pack_bs_mutex);
 #endif
-    const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+    const int tile_idx =
+        pack_bs_sync->pack_bs_mt_exit
+            ? -1
+            : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(pack_bs_sync->mutex_);
+    pthread_mutex_unlock(pack_bs_mutex);
 #endif
+    // When pack_bs_mt_exit is set to true, other workers need not pursue any
+    // further jobs.
     if (tile_idx == -1) break;
     TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
     thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
@@ -2728,6 +3056,7 @@
     av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
   }
 
+  error_info->setjmp = 0;
   return 1;
 }
 
@@ -2902,8 +3231,9 @@
 // populates next job information and returns 1, else returns 0.
 static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
                                         CdefSearchCtx *cdef_search_ctx,
-                                        int *cur_fbr, int *cur_fbc,
-                                        int *sb_count) {
+                                        volatile int *cur_fbr,
+                                        volatile int *cur_fbc,
+                                        volatile int *sb_count) {
 #if CONFIG_MULTITHREAD
   pthread_mutex_lock(cdef_sync->mutex_);
 #endif  // CONFIG_MULTITHREAD
@@ -2913,15 +3243,15 @@
 
   // If a block is skipped, do not process the block and
   // check the skip condition for the next block.
-  while ((!cdef_sync->end_of_frame) &&
-         (cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
-                       cdef_sync->fbc))) {
+  while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
+         cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+                      cdef_sync->fbc)) {
     update_next_job_info(cdef_sync, nvfb, nhfb);
   }
 
   // Populates information needed for the current job and updates the row and
   // column indices of the next block to be processed.
-  if (cdef_sync->end_of_frame == 0) {
+  if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
     do_next_block = 1;
     *cur_fbr = cdef_sync->fbr;
     *cur_fbc = cdef_sync->fbc;
@@ -2937,43 +3267,68 @@
 
 // Hook function for each thread in CDEF search multi-threading.
 static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
-  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
-  CdefSearchCtx *cdef_search_ctx = (CdefSearchCtx *)arg2;
-  int cur_fbr, cur_fbc, sb_count;
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(cdef_mutex_);
+    cdef_sync->cdef_mt_exit = true;
+    pthread_mutex_unlock(cdef_mutex_);
+#endif
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  volatile int cur_fbr, cur_fbc, sb_count;
   while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
                            &sb_count)) {
-    av1_cdef_mse_calc_block(cdef_search_ctx, cur_fbr, cur_fbc, sb_count);
+    av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+                            sb_count);
   }
+  error_info->setjmp = 0;
   return 1;
 }
 
 // Assigns CDEF search hook function and thread data to each worker.
-static void prepare_cdef_workers(MultiThreadInfo *mt_info,
-                                 CdefSearchCtx *cdef_search_ctx,
-                                 AVxWorkerHook hook, int num_workers) {
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                 int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    thread_data->cpi = cpi;
     worker->hook = hook;
-    worker->data1 = &mt_info->cdef_sync;
-    worker->data2 = cdef_search_ctx;
+    worker->data1 = thread_data;
+    worker->data2 = &mt_info->cdef_sync;
   }
 }
 
 // Implements multi-threading for CDEF search.
-void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
-                                CdefSearchCtx *cdef_search_ctx) {
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
   AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
   const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
 
   cdef_reset_job_info(cdef_sync);
-  prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
-                       num_workers);
+  prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
   launch_workers(mt_info, num_workers);
-  sync_enc_workers(mt_info, cm, num_workers);
+  sync_enc_workers(mt_info, &cpi->common, num_workers);
 }
 
 // Computes num_workers for temporal filter multi-threading.
-static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
   // For single-pass encode, scaling the number of workers with the temporal
   // filter block size was not found to improve speed, so the thread
   // assignment for single-pass encode is still based on
   // compute_num_enc_workers().
@@ -3058,11 +3413,10 @@
     case MOD_AI:
       if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
         num_mod_workers = compute_num_ai_workers(cpi);
-        break;
       } else {
         num_mod_workers = 0;
-        break;
       }
+      break;
     default: assert(0); break;
   }
   return (num_mod_workers);
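
All of the worker hooks changed above follow the same error-propagation pattern: each worker installs a setjmp() target in its per-thread aom_internal_error_info, and if an encoder error longjmp()s back, it clears the 'setjmp' field, sets a shared *_mt_exit flag under the module mutex so the remaining workers stop fetching jobs, and returns 0 so the main thread can report the error after sync_enc_workers(). A minimal standalone sketch of that pattern follows; ExampleSync, ExampleWorkerData, fetch_next_job() and do_job() are invented names for illustration, not libaom symbols.

#include <pthread.h>
#include <setjmp.h>
#include <stdbool.h>

typedef struct {
  pthread_mutex_t *mutex_;  // owned by the module, like enc_row_mt->mutex_
  bool mt_exit;             // set once any worker hits an error
  int next_job;
  int num_jobs;
} ExampleSync;

typedef struct {
  // Stand-in for the jmp/setjmp fields of aom_internal_error_info.
  struct {
    jmp_buf jmp;
    int setjmp;
  } error_info;
} ExampleWorkerData;

// Returns 1 and a job index while work remains and no worker has failed.
static int fetch_next_job(ExampleSync *sync, int *job) {
  int has_job = 0;
  pthread_mutex_lock(sync->mutex_);
  if (!sync->mt_exit && sync->next_job < sync->num_jobs) {
    *job = sync->next_job++;
    has_job = 1;
  }
  pthread_mutex_unlock(sync->mutex_);
  return has_job;
}

static void do_job(ExampleWorkerData *wd, int job) {
  // A real hook does per-row/per-tile work here, which may longjmp() back
  // to the setjmp() point below on error.
  (void)wd;
  (void)job;
}

static int example_worker_hook(void *arg1, void *arg2) {
  ExampleWorkerData *const wd = (ExampleWorkerData *)arg1;
  ExampleSync *const sync = (ExampleSync *)arg2;

  // The jmp_buf is valid only while this function is on the stack, so the
  // 'setjmp' field must be cleared again on every exit path.
  if (setjmp(wd->error_info.jmp)) {
    wd->error_info.setjmp = 0;
    pthread_mutex_lock(sync->mutex_);
    sync->mt_exit = true;  // stop the other workers from fetching jobs
    pthread_mutex_unlock(sync->mutex_);
    return 0;              // the main thread reports the error after syncing
  }
  wd->error_info.setjmp = 1;

  int job;
  while (fetch_next_job(sync, &job)) do_job(wd, job);

  wd->error_info.setjmp = 0;
  return 1;
}

The hooks above differ only in what a job is (a wiener-variance row, a bitstream-packing tile, a CDEF block) and in the extra cleanup each module needs, e.g. set_mb_wiener_var_calc_done(), which also releases the row sync points so dependent workers are not left waiting.
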
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index 6c4bce4..4f4c232 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -23,6 +23,7 @@
   struct AV1_COMP *cpi;
   struct ThreadData *td;
   struct ThreadData *original_td;
+  struct aom_internal_error_info error_info;
   AV1LfSync *lf_sync;
   LFWorkerData *lf_data;
   int start;
@@ -52,9 +53,9 @@
 
 void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
 
-void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync);
 
-void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data);
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
 
 #if !CONFIG_REALTIME_ONLY
 void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
@@ -99,13 +100,12 @@
 void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
 #endif  // CONFIG_MULTITHREAD
 
-int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
                                       MULTI_THREADED_MODULES mod_name);
 
 void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
 
-void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
-                                CdefSearchCtx *cdef_search_ctx);
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi);
 
 void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
 
@@ -116,7 +116,7 @@
     unsigned int *max_tile_size, uint32_t *const obu_header_size,
     uint8_t **tile_data_start, const int num_workers);
 
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
 
 int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
 
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 1fad149..7ddc3e3 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -33,6 +33,7 @@
 #include "av1/encoder/block.h"
 #include "av1/encoder/dwt.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
@@ -259,6 +260,35 @@
   return sr;
 }
 
+static AOM_INLINE const search_site_config *
+av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      SEARCH_METHODS search_method) {
+  const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+  // For AVIF applications, even the source frames can have changing
+  // resolution, so we need to check the strides manually.
+  // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+  // that's shared by multiple threads. In most cases where all frames have the
+  // same resolution, the cache contains the search site config that we need.
+  const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+  if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) {
+    return mv_search_params->search_site_cfg[SS_CFG_FPF];
+  }
+
+  // If the cache does not contain the correct stride, then we will need to rely
+  // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+  // thread level config doesn't match, then we need to update it.
+  search_method = search_method_lookup[search_method];
+  assert(search_method_lookup[search_method] == search_method &&
+         "The search_method_lookup table should be idempotent.");
+  if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+    av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+                                   ref_stride);
+  }
+
+  return x->search_site_cfg_buf;
+}
+
 static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
                                                 const MV *ref_mv,
                                                 FULLPEL_MV *best_mv,
@@ -272,18 +302,18 @@
   const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
 
   const search_site_config *first_pass_search_sites =
-      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+      av1_get_first_pass_search_site_config(cpi, x, NSTEP);
   const int fine_search_interval =
       cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
   FULLPEL_MOTION_SEARCH_PARAMS ms_params;
   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
-                                     start_mv, first_pass_search_sites,
+                                     start_mv, first_pass_search_sites, NSTEP,
                                      fine_search_interval);
-  av1_set_mv_search_method(&ms_params, first_pass_search_sites, NSTEP);
 
   FULLPEL_MV this_best_mv;
+  FULLPEL_MV_STATS best_mv_stats;
   tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
-                                  &this_best_mv, NULL);
+                                  &this_best_mv, &best_mv_stats, NULL);
 
   if (tmp_err < INT_MAX) {
     aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
@@ -744,9 +774,8 @@
   if ((current_frame->frame_number > 1) && golden_frame != NULL) {
     FULLPEL_MV tmp_mv = kZeroFullMv;
     // Assume 0,0 motion with no mv overhead.
-    xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
-    xd->plane[0].pre[0].stride = golden_frame->y_stride;
-    xd->plane[0].pre[0].width = golden_frame->y_width;
+    av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1);
+    xd->plane[0].pre[0].buf += recon_yoffset;
     gf_motion_error =
         get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
                                       &x->plane[0].src, &xd->plane[0].pre[0]);
@@ -1032,9 +1061,11 @@
   }
 }
 
-static void free_firstpass_data(FirstPassData *firstpass_data) {
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
   aom_free(firstpass_data->raw_motion_err_list);
+  firstpass_data->raw_motion_err_list = NULL;
   aom_free(firstpass_data->mb_stats);
+  firstpass_data->mb_stats = NULL;
 }
 
 int av1_get_unit_rows_in_tile(const TileInfo *tile,
@@ -1073,17 +1104,7 @@
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  const int num_planes = av1_num_planes(&cpi->common);
-  for (int plane = 0; plane < num_planes; plane++) {
-    const int subsampling_xy =
-        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
-              : 0;
-    const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
-    CHECK_MEM_ERROR(
-        cm, cpi->td.mb.plane[plane].src_diff,
-        (int16_t *)aom_memalign(
-            32, sizeof(*cpi->td.mb.plane[plane].src_diff) * sb_size));
-  }
+
   for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
       TileDataEnc *const tile_data =
@@ -1091,12 +1112,6 @@
       first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
     }
   }
-  for (int plane = 0; plane < num_planes; plane++) {
-    if (cpi->td.mb.plane[plane].src_diff) {
-      aom_free(cpi->td.mb.plane[plane].src_diff);
-      cpi->td.mb.plane[plane].src_diff = NULL;
-    }
-  }
 }
 
 void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
@@ -1187,6 +1202,16 @@
 
     enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
 
+#if CONFIG_MULTITHREAD
+    if (cpi->ppi->p_mt_info.num_workers > 1) {
+      pthread_mutex_lock(enc_row_mt->mutex_);
+      bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+      pthread_mutex_unlock(enc_row_mt->mutex_);
+      // Exit in case any worker has encountered an error.
+      if (firstpass_mt_exit) return;
+    }
+#endif
+
     if (unit_col_in_tile == 0) {
       last_mv = *first_top_mv;
     }
@@ -1246,7 +1271,7 @@
   setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
   FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
   FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
-  free_firstpass_data(&cpi->firstpass_data);
+  av1_free_firstpass_data(&cpi->firstpass_data);
   update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number,
                          ts_duration, BLOCK_16X16);
 }
@@ -1365,6 +1390,7 @@
   av1_init_mode_probs(cm->fc);
   av1_init_mv_probs(cm);
   av1_initialize_rd_consts(cpi);
+  av1_alloc_src_diff_buf(cm, &cpi->td.mb);
 
   enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
   enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
@@ -1382,7 +1408,8 @@
       frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
   const double raw_err_stdev =
       raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
-  free_firstpass_data(&cpi->firstpass_data);
+  av1_free_firstpass_data(&cpi->firstpass_data);
+  av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
 
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index e18e9e4..d01363a 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -568,6 +568,8 @@
                         const BLOCK_SIZE fp_block_size);
 void av1_end_first_pass(struct AV1_COMP *cpi);
 
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
 void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
 void av1_accumulate_stats(FIRSTPASS_STATS *section,
                           const FIRSTPASS_STATS *frame);
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index bc5e186..73910de 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -30,6 +30,83 @@
 // Border over which to compute the global motion
 #define ERRORADV_BORDER 0
 
+/* clang-format off */
+// Error metric used for global motion evaluation.
+// For 8-bit input, the pixel error used to index this table will always
+// be between -255 and +255. But for 10- and 12-bit input, we use interpolation
+// which means that we need to support indices of -256 and +256 as well.
+// Therefore, the table is offset so that logical index 0 corresponds to
+// error_measure_lut[256].
+const int error_measure_lut[513] = {
+    // pow 0.7
+    16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113,
+    16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749,
+    15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381,
+    15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010,
+    14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634,
+    14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254,
+    14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870,
+    13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481,
+    13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088,
+    13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689,
+    12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285,
+    12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875,
+    11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458,
+    11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036,
+    10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606,
+    10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168,
+    10113, 10058, 10002,  9947,  9891,  9835,  9779,  9723,
+     9666,  9610,  9553,  9497,  9440,  9383,  9326,  9268,
+     9211,  9153,  9095,  9037,  8979,  8921,  8862,  8804,
+     8745,  8686,  8627,  8568,  8508,  8449,  8389,  8329,
+     8269,  8208,  8148,  8087,  8026,  7965,  7903,  7842,
+     7780,  7718,  7656,  7593,  7531,  7468,  7405,  7341,
+     7278,  7214,  7150,  7086,  7021,  6956,  6891,  6826,
+     6760,  6695,  6628,  6562,  6495,  6428,  6361,  6293,
+     6225,  6157,  6089,  6020,  5950,  5881,  5811,  5741,
+     5670,  5599,  5527,  5456,  5383,  5311,  5237,  5164,
+     5090,  5015,  4941,  4865,  4789,  4713,  4636,  4558,
+     4480,  4401,  4322,  4242,  4162,  4080,  3998,  3916,
+     3832,  3748,  3663,  3577,  3490,  3402,  3314,  3224,
+     3133,  3041,  2948,  2854,  2758,  2661,  2562,  2461,
+     2359,  2255,  2148,  2040,  1929,  1815,  1698,  1577,
+     1452,  1323,  1187,  1045,   894,   731,   550,   339,
+        0,   339,   550,   731,   894,  1045,  1187,  1323,
+     1452,  1577,  1698,  1815,  1929,  2040,  2148,  2255,
+     2359,  2461,  2562,  2661,  2758,  2854,  2948,  3041,
+     3133,  3224,  3314,  3402,  3490,  3577,  3663,  3748,
+     3832,  3916,  3998,  4080,  4162,  4242,  4322,  4401,
+     4480,  4558,  4636,  4713,  4789,  4865,  4941,  5015,
+     5090,  5164,  5237,  5311,  5383,  5456,  5527,  5599,
+     5670,  5741,  5811,  5881,  5950,  6020,  6089,  6157,
+     6225,  6293,  6361,  6428,  6495,  6562,  6628,  6695,
+     6760,  6826,  6891,  6956,  7021,  7086,  7150,  7214,
+     7278,  7341,  7405,  7468,  7531,  7593,  7656,  7718,
+     7780,  7842,  7903,  7965,  8026,  8087,  8148,  8208,
+     8269,  8329,  8389,  8449,  8508,  8568,  8627,  8686,
+     8745,  8804,  8862,  8921,  8979,  9037,  9095,  9153,
+     9211,  9268,  9326,  9383,  9440,  9497,  9553,  9610,
+     9666,  9723,  9779,  9835,  9891,  9947, 10002, 10058,
+    10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
+    10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
+    10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
+    11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
+    11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
+    12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
+    12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
+    13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
+    13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
+    13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
+    14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
+    14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
+    14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
+    15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
+    15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
+    16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
+    16384,
+};
+/* clang-format on */
+
 int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
   return best_erroradvantage < erroradv_tr &&
          best_erroradvantage * params_cost < erroradv_prod_tr;
@@ -110,15 +187,76 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-static int64_t highbd_warp_error(
-    WarpedMotionParams *wm, const uint16_t *const ref, int width, int height,
-    int stride, const uint16_t *const dst, int p_col, int p_row, int p_width,
-    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
-    int64_t best_error, uint8_t *segment_map, int segment_map_stride) {
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+                                     const uint16_t *const dst, int dst_stride,
+                                     int p_width, int p_height) {
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+  assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+  int sad = 0;
+  for (int i = 0; i < p_height; ++i) {
+    for (int j = 0; j < p_width; ++j) {
+      sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+    }
+  }
+  return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif  // WARP_ERROR_BLOCK != 32
+static int64_t highbd_segmented_frame_error(
+    const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
+    int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
+    int segment_map_stride) {
+  (void)bd;
+  int patch_w, patch_h;
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  int64_t sum_error = 0;
+  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+      // avoid computing error into the frame padding
+      patch_w = AOMMIN(error_bsize_w, p_width - j);
+      patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+      if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+        sum_error += aom_highbd_sad32x32(
+            CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+            CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+      } else {
+        sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+                                        dst + j + i * dst_stride, dst_stride,
+                                        patch_w, patch_h);
+      }
+    }
+  }
+  return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_warp_error"
+#endif  // WARP_ERROR_BLOCK != 32
+static int64_t highbd_warp_error(WarpedMotionParams *wm,
+                                 const uint16_t *const ref, int ref_width,
+                                 int ref_height, int ref_stride,
+                                 const uint16_t *const dst, int dst_stride,
+                                 int p_col, int p_row, int p_width,
+                                 int p_height, int subsampling_x,
+                                 int subsampling_y, int bd, int64_t best_error,
+                                 uint8_t *segment_map, int segment_map_stride) {
   int64_t gm_sumerr = 0;
   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+  DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
 
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
   conv_params.use_dist_wtd_comp_avg = 0;
@@ -131,14 +269,22 @@
       if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
       // avoid warping extra blocks in the padded region of the frame
       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
-      highbd_warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w,
-                        warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
-                        bd, &conv_params);
-      gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK,
-                                               dst + j + i * p_stride, warp_w,
-                                               warp_h, p_stride, bd);
+      const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+      const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+      highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
+                        warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
+                        subsampling_y, bd, &conv_params);
+
+      if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+        gm_sumerr += aom_highbd_sad32x32(
+            CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+            CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+      } else {
+        gm_sumerr +=
+            generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+                               dst_stride, warp_w, warp_h);
+      }
+
       if (gm_sumerr > best_error) return INT64_MAX;
     }
   }
@@ -146,10 +292,67 @@
 }
 #endif
 
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+                              const uint8_t *const dst, int dst_stride,
+                              int p_width, int p_height) {
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+  assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+  int sad = 0;
+  for (int i = 0; i < p_height; ++i) {
+    for (int j = 0; j < p_width; ++j) {
+      sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+    }
+  }
+  return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_warp_error"
+#endif  // WARP_ERROR_BLOCK != 32
+static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
+                                     const uint8_t *const dst, int dst_stride,
+                                     int p_width, int p_height,
+                                     uint8_t *segment_map,
+                                     int segment_map_stride) {
+  int patch_w, patch_h;
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  int64_t sum_error = 0;
+  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+      // Only compute the error if this block contains inliers from the motion
+      // model
+      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+      // avoid computing error into the frame padding
+      patch_w = AOMMIN(error_bsize_w, p_width - j);
+      patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+      if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+        sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+                                  dst + j + i * dst_stride, dst_stride);
+      } else {
+        sum_error +=
+            generic_sad(ref + j + i * ref_stride, ref_stride,
+                        dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+      }
+    }
+  }
+  return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in warp_error"
+#endif  // WARP_ERROR_BLOCK != 32
 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
-                          int width, int height, int stride,
-                          const uint8_t *const dst, int p_col, int p_row,
-                          int p_width, int p_height, int p_stride,
+                          int ref_width, int ref_height, int ref_stride,
+                          const uint8_t *const dst, int dst_stride, int p_col,
+                          int p_row, int p_width, int p_height,
                           int subsampling_x, int subsampling_y,
                           int64_t best_error, uint8_t *segment_map,
                           int segment_map_stride) {
@@ -170,62 +373,72 @@
       if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
       // avoid warping extra blocks in the padded region of the frame
       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
-      warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
-                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
+      warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+      warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+      warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w,
+                 warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+                 &conv_params);
 
-      gm_sumerr +=
-          av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
-                               warp_w, warp_h, p_stride);
+      if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+        gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+                                  dst + j + i * dst_stride, dst_stride);
+      } else {
+        gm_sumerr +=
+            generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+                        dst_stride, warp_w, warp_h);
+      }
+
       if (gm_sumerr > best_error) return INT64_MAX;
     }
   }
   return gm_sumerr;
 }
 
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int64_t best_error,
-                       uint8_t *segment_map, int segment_map_stride) {
-  force_wmtype(wm, wm->wmtype);
-  assert(wm->wmtype <= AFFINE);
-  if (!av1_get_shear_params(wm)) return INT64_MAX;
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int ref_stride, uint8_t *dst, int dst_stride,
+                                  int p_width, int p_height,
+                                  uint8_t *segment_map,
+                                  int segment_map_stride) {
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (use_hbd)
-    return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height,
-                             stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row,
-                             p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, bd, best_error, segment_map,
-                             segment_map_stride);
+  if (use_hbd) {
+    return highbd_segmented_frame_error(
+        CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst),
+        dst_stride, p_width, p_height, bd, segment_map, segment_map_stride);
+  }
 #endif
   (void)use_hbd;
   (void)bd;
-  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
-                    p_height, p_stride, subsampling_x, subsampling_y,
-                    best_error, segment_map, segment_map_stride);
+  return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width,
+                               p_height, segment_map, segment_map_stride);
 }
 
-// Factors used to calculate the thresholds for av1_warp_error
-static double thresh_factors[GM_MAX_REFINEMENT_STEPS] = { 1.25, 1.20, 1.15,
-                                                          1.10, 1.05 };
-
-static INLINE int64_t calc_approx_erroradv_threshold(
-    double scaling_factor, int64_t erroradv_threshold) {
-  return erroradv_threshold <
-                 (int64_t)(((double)INT64_MAX / scaling_factor) + 0.5)
-             ? (int64_t)(scaling_factor * erroradv_threshold + 0.5)
-             : INT64_MAX;
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int ref_width, int ref_height,
+                       int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+                       int p_row, int p_width, int p_height, int subsampling_x,
+                       int subsampling_y, int64_t best_error,
+                       uint8_t *segment_map, int segment_map_stride) {
+  if (!av1_get_shear_params(wm)) return INT64_MAX;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd)
+    return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width,
+                             ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst),
+                             dst_stride, p_col, p_row, p_width, p_height,
+                             subsampling_x, subsampling_y, bd, best_error,
+                             segment_map, segment_map_stride);
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride,
+                    p_col, p_row, p_width, p_height, subsampling_x,
+                    subsampling_y, best_error, segment_map, segment_map_stride);
 }
 
 int64_t av1_refine_integerized_param(
     WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
     uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
     int d_width, int d_height, int d_stride, int n_refinements,
-    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
-    int64_t erroradv_threshold) {
+    int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) {
   static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
   const int border = ERRORADV_BORDER;
   int i = 0, p;
@@ -238,36 +451,51 @@
   int32_t best_param;
 
   force_wmtype(wm, wmtype);
-  best_error =
-      av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
-                     dst + border * d_stride + border, border, border,
-                     d_width - 2 * border, d_height - 2 * border, d_stride, 0,
-                     0, best_frame_error, segment_map, segment_map_stride);
+  wm->wmtype = get_wmtype(wm);
 
   if (n_refinements == 0) {
-    wm->wmtype = get_wmtype(wm);
-    return best_error;
+    // Compute the maximum error value that will be accepted, so that
+    // av1_warp_error can terminate early if it proves the model will not
+    // be accepted.
+    int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr);
+    return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                          dst + border * d_stride + border, d_stride, border,
+                          border, d_width - 2 * border, d_height - 2 * border,
+                          0, 0, selection_threshold, segment_map,
+                          segment_map_stride);
   }
 
-  best_error = AOMMIN(best_error, best_frame_error);
+  // When refining, use a slightly higher threshold for the initial error
+  // calculation - see comment above erroradv_early_tr for why.
+  int64_t selection_threshold =
+      (int64_t)lrint(ref_frame_error * erroradv_early_tr);
+  best_error =
+      av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                     dst + border * d_stride + border, d_stride, border, border,
+                     d_width - 2 * border, d_height - 2 * border, 0, 0,
+                     selection_threshold, segment_map, segment_map_stride);
+
+  if (best_error > selection_threshold) {
+    return INT64_MAX;
+  }
+
   step = 1 << (n_refinements - 1);
   for (i = 0; i < n_refinements; i++, step >>= 1) {
-    int64_t error_adv_thresh =
-        calc_approx_erroradv_threshold(thresh_factors[i], erroradv_threshold);
     for (p = 0; p < n_params; ++p) {
       int step_dir = 0;
-      // Skip searches for parameters that are forced to be 0
       param = param_mat + p;
       curr_param = *param;
       best_param = curr_param;
       // look to the left
+      // Note: We have to use force_wmtype() to keep the proper symmetry for
+      // ROTZOOM type models
       *param = add_param_offset(p, curr_param, -step);
+      force_wmtype(wm, wmtype);
       step_error =
           av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
-                         dst + border * d_stride + border, border, border,
-                         d_width - 2 * border, d_height - 2 * border, d_stride,
-                         0, 0, AOMMIN(best_error, error_adv_thresh),
-                         segment_map, segment_map_stride);
+                         dst + border * d_stride + border, d_stride, border,
+                         border, d_width - 2 * border, d_height - 2 * border, 0,
+                         0, best_error, segment_map, segment_map_stride);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
@@ -276,40 +504,42 @@
 
       // look to the right
       *param = add_param_offset(p, curr_param, step);
+      force_wmtype(wm, wmtype);
       step_error =
           av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
-                         dst + border * d_stride + border, border, border,
-                         d_width - 2 * border, d_height - 2 * border, d_stride,
-                         0, 0, AOMMIN(best_error, error_adv_thresh),
-                         segment_map, segment_map_stride);
+                         dst + border * d_stride + border, d_stride, border,
+                         border, d_width - 2 * border, d_height - 2 * border, 0,
+                         0, best_error, segment_map, segment_map_stride);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
         step_dir = 1;
       }
-      *param = best_param;
 
       // look to the direction chosen above repeatedly until error increases
       // for the biggest step size
       while (step_dir) {
         *param = add_param_offset(p, best_param, step * step_dir);
+        force_wmtype(wm, wmtype);
         step_error =
             av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
-                           dst + border * d_stride + border, border, border,
-                           d_width - 2 * border, d_height - 2 * border,
-                           d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh),
-                           segment_map, segment_map_stride);
+                           dst + border * d_stride + border, d_stride, border,
+                           border, d_width - 2 * border, d_height - 2 * border,
+                           0, 0, best_error, segment_map, segment_map_stride);
         if (step_error < best_error) {
           best_error = step_error;
           best_param = *param;
         } else {
-          *param = best_param;
           step_dir = 0;
         }
       }
+
+      // Restore best parameter value so far
+      *param = best_param;
+      force_wmtype(wm, wmtype);
     }
   }
-  force_wmtype(wm, wmtype);
+
   wm->wmtype = get_wmtype(wm);
   return best_error;
 }
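
To make the new acceptance arithmetic concrete (the error values below are invented purely for illustration; the thresholds are the ones defined in global_motion.h): suppose ref_frame_error = 100000. With refinement enabled, av1_refine_integerized_param() computes selection_threshold = lrint(100000 * erroradv_early_tr) = 70000, passes it to the initial av1_warp_error() call as best_error so that the SAD accumulation bails out with INT64_MAX as soon as it exceeds 70000, and rejects the model outright if its unrefined error is above that. If refinement then settles at warp_error = 60000, the caller in global_motion_facade.c computes erroradvantage = 60000 / 100000 = 0.60, which beats the initial best_erroradv of erroradv_tr = 0.65, and the selected model must finally also satisfy erroradvantage * params_cost < erroradv_prod_tr = 20000 in av1_is_enough_erroradvantage().
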
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index cf1d0fd..8c9c60f 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -40,7 +40,7 @@
 
   // Pointer to hold inliers from motion model.
   uint8_t *segment_map;
-} GlobalMotionThreadData;
+} GlobalMotionData;
 
 typedef struct {
   // Holds the mapping of each thread to past/future direction.
@@ -63,43 +63,82 @@
   // Data related to assigning jobs for global motion multi-threading.
   JobInfo job_info;
 
-  // Data specific to each worker in global motion multi-threading.
-  // thread_data[i] stores the thread specific data for worker 'i'.
-  GlobalMotionThreadData *thread_data;
-
 #if CONFIG_MULTITHREAD
   // Mutex lock used while dispatching jobs.
   pthread_mutex_t *mutex_;
 #endif
 
-  // Width and height for which segment_map is allocated for each thread.
-  int allocated_width;
-  int allocated_height;
-
-  // Number of workers for which thread_data is allocated.
-  int8_t allocated_workers;
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool gm_mt_exit;
 } AV1GlobalMotionSync;
 
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model);
 
-// TODO(sarahparker) These need to be retuned for speed 0 and 1 to
-// maximize gains from segmented error metric
+// Criteria for accepting a global motion model
 static const double erroradv_tr = 0.65;
 static const double erroradv_prod_tr = 20000;
 
+// Early exit threshold for global motion refinement
+// This is set slightly higher than erroradv_tr, as a compromise between
+// two factors:
+//
+// 1) By rejecting un-promising models early, we can reduce the encode time
+//    spent trying to refine them
+//
+// 2) When we refine a model, its error may decrease to below the acceptance
+//    threshold even if the model is initially above the threshold
+static const double erroradv_early_tr = 0.70;
+
 int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
 
 void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
                                           int height, int *inliers,
                                           int num_inliers);
 
+extern const int error_measure_lut[513];
+
+static INLINE int error_measure(int err) {
+  return error_measure_lut[256 + err];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_error_measure(int err, int bd) {
+  const int b = bd - 8;
+  const int bmask = (1 << b) - 1;
+  const int v = (1 << b);
+
+  // Split error into two parts and do an interpolated table lookup
+  // To compute the table index and interpolation value, we want to calculate
+  // the quotient and remainder of err / 2^b. But it is very important that
+  // the division must round down, and the remainder must be positive,
+  // ie. in the range [0, 2^b).
+  //
+  // In C, the >> and & operators do what we want, but the / and % operators
+  // give the wrong results for negative inputs. So we must use >> and & here.
+  //
+  // For example, if bd == 10 and err == -5, compare the results:
+  //       (-5) >> 2 = -2, (-5) & 3 =  3
+  //   vs. (-5) / 4  = -1, (-5) % 4 = -1
+  const int e1 = err >> b;
+  const int e2 = err & bmask;
+  return error_measure_lut[256 + e1] * (v - e2) +
+         error_measure_lut[257 + e1] * e2;
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int ref_stride, uint8_t *dst, int dst_stride,
+                                  int p_width, int p_height,
+                                  uint8_t *segment_map, int segment_map_stride);
+
 // Returns the error between the result of applying motion 'wm' to the frame
 // described by 'ref' and the frame described by 'dst'.
 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
+                       const uint8_t *ref, int ref_width, int ref_height,
+                       int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+                       int p_row, int p_width, int p_height, int subsampling_x,
                        int subsampling_y, int64_t best_error,
                        uint8_t *segment_map, int segment_map_stride);
 
@@ -110,8 +149,7 @@
     WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
     uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
     int d_width, int d_height, int d_stride, int n_refinements,
-    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
-    int64_t erroradv_threshold);
+    int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride);
 
 #ifdef __cplusplus
 }  // extern "C"
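
Working the example from the highbd_error_measure() comment through the table (the two LUT values are taken from error_measure_lut in global_motion.c; everything else is just the formula): for bd = 10 we have b = 2, v = 4 and bmask = 3, so for err = -5

  e1 = -5 >> 2 = -2,  e2 = -5 & 3 = 3
  highbd_error_measure(-5, 10)
      = error_measure_lut[256 - 2] * (4 - 3) + error_measure_lut[257 - 2] * 3
      = 550 * 1 + 339 * 3
      = 1567

i.e. v times the value interpolated 3/4 of the way from the entry for an 8-bit-scale error of -2 towards the entry for -1. Using / and % instead would give e1 = -1 and e2 = -1, which indexes the wrong entries and applies a negative weight to one of them.
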
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 1a00cbb..02a4e70 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -20,8 +20,9 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/global_motion_facade.h"
 
-// Highest motion model to search.
-#define GLOBAL_TRANS_TYPES_ENC 3
+// Range of model types to search
+#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM
+#define LAST_GLOBAL_TRANS_TYPE ROTZOOM
 
 // Computes the cost for the warp parameters.
 static int gm_get_params_cost(const WarpedMotionParams *gm,
@@ -73,47 +74,46 @@
   return (params_cost << AV1_PROB_COST_SHIFT);
 }
 
-// Calculates the threshold to be used for warp error computation.
-static AOM_INLINE int64_t calc_erroradv_threshold(int64_t ref_frame_error) {
-  return (int64_t)(ref_frame_error * erroradv_tr + 0.5);
-}
-
 // For the given reference frame, computes the global motion parameters for
 // different motion models and finds the best.
 static AOM_INLINE void compute_global_motion_for_ref_frame(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
     MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
     const int segment_map_h, const WarpedMotionParams *ref_params) {
-  ThreadData *const td = &cpi->td;
-  MACROBLOCK *const x = &td->mb;
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int i;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int src_width = cpi->source->y_crop_width;
   int src_height = cpi->source->y_crop_height;
   int src_stride = cpi->source->y_stride;
-  WarpedMotionParams tmp_wm_params;
-  const double *params_this_motion;
   assert(ref_buf[frame] != NULL);
-  TransformationType model;
   int bit_depth = cpi->common.seq_params->bit_depth;
   GlobalMotionMethod global_motion_method = default_global_motion_method;
   int num_refinements = cpi->sf.gm_sf.num_refinement_steps;
+  bool mem_alloc_failed = false;
 
-  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
-    if (!aom_compute_global_motion(model, cpi->source, ref_buf[frame],
-                                   bit_depth, global_motion_method,
-                                   motion_models, RANSAC_NUM_MOTIONS)) {
+  // Select the best model based on fractional error reduction.
+  // By initializing best_erroradv to erroradv_tr, the logic used to select
+  // the best model automatically filters out any model which doesn't meet
+  // the required quality threshold.
+  double best_erroradv = erroradv_tr;
+  for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE;
+       model <= LAST_GLOBAL_TRANS_TYPE; ++model) {
+    if (!aom_compute_global_motion(
+            model, cpi->source, ref_buf[frame], bit_depth, global_motion_method,
+            motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) {
+      if (mem_alloc_failed) {
+        aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate global motion buffers");
+      }
       continue;
     }
 
-    int64_t best_ref_frame_error = 0;
-    int64_t best_warp_error = INT64_MAX;
-    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+    for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
       if (motion_models[i].num_inliers == 0) continue;
 
-      params_this_motion = motion_models[i].params;
-      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+      WarpedMotionParams tmp_wm_params;
+      av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params);
 
       // Skip models that we won't use (IDENTITY or TRANSLATION)
       //
@@ -133,29 +133,26 @@
 
       int64_t ref_frame_error = av1_segmented_frame_error(
           is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
-          ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width,
-          src_height, src_stride, segment_map, segment_map_w);
+          ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride,
+          src_width, src_height, segment_map, segment_map_w);
 
       if (ref_frame_error == 0) continue;
 
-      const int64_t erroradv_threshold =
-          calc_erroradv_threshold(ref_frame_error);
-
       const int64_t warp_error = av1_refine_integerized_param(
           &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
           ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width,
           ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride,
           cpi->source->y_buffer, src_width, src_height, src_stride,
-          num_refinements, best_warp_error, segment_map, segment_map_w,
-          erroradv_threshold);
+          num_refinements, ref_frame_error, segment_map, segment_map_w);
 
       // av1_refine_integerized_param() can return a simpler model type than
       // its input, so re-check model type here
       if (tmp_wm_params.wmtype <= TRANSLATION) continue;
 
-      if (warp_error < best_warp_error) {
-        best_ref_frame_error = ref_frame_error;
-        best_warp_error = warp_error;
+      double erroradvantage = (double)warp_error / ref_frame_error;
+
+      if (erroradvantage < best_erroradv) {
+        best_erroradv = erroradvantage;
         // Save the wm_params modified by
         // av1_refine_integerized_param() rather than motion index to
         // avoid rerunning refine() below.
@@ -163,47 +160,41 @@
                sizeof(WarpedMotionParams));
       }
     }
-    assert(cm->global_motion[frame].wmtype <= AFFINE);
-    if (!av1_get_shear_params(&cm->global_motion[frame]))
-      cm->global_motion[frame] = default_warp_params;
+  }
+
+  if (!av1_get_shear_params(&cm->global_motion[frame]))
+    cm->global_motion[frame] = default_warp_params;
 
 #if 0
-    // We never choose translational models, so this code is disabled
-    if (cm->global_motion[frame].wmtype == TRANSLATION) {
-      cm->global_motion[frame].wmmat[0] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[0]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-      cm->global_motion[frame].wmmat[1] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[1]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-    }
+  // We never choose translational models, so this code is disabled
+  if (cm->global_motion[frame].wmtype == TRANSLATION) {
+    cm->global_motion[frame].wmmat[0] =
+        convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                              cm->global_motion[frame].wmmat[0]) *
+        GM_TRANS_ONLY_DECODE_FACTOR;
+    cm->global_motion[frame].wmmat[1] =
+        convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                              cm->global_motion[frame].wmmat[1]) *
+        GM_TRANS_ONLY_DECODE_FACTOR;
+  }
 #endif
 
-    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
+  if (cm->global_motion[frame].wmtype == IDENTITY) return;
 
-    // Once we get here, best_ref_frame_error must be > 0. This is because
-    // of the logic above, which skips  over any models which have
-    // ref_frame_error == 0
-    assert(best_ref_frame_error > 0);
-
-    // If the best error advantage found doesn't meet the threshold for
-    // this motion type, revert to IDENTITY.
-    if (!av1_is_enough_erroradvantage(
-            (double)best_warp_error / best_ref_frame_error,
-            gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                               cm->features.allow_high_precision_mv))) {
-      cm->global_motion[frame] = default_warp_params;
-    }
-
-    if (cm->global_motion[frame].wmtype != IDENTITY) break;
+  // If the best error advantage found doesn't meet the threshold for
+  // this motion type, revert to IDENTITY.
+  if (!av1_is_enough_erroradvantage(
+          best_erroradv,
+          gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                             cm->features.allow_high_precision_mv))) {
+    cm->global_motion[frame] = default_warp_params;
   }
 }
 
 // Computes global motion for the given reference frame.
 void av1_compute_gm_for_valid_ref_frames(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
     MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
     int segment_map_h) {
   AV1_COMMON *const cm = &cpi->common;
@@ -211,9 +202,9 @@
       cm->prev_frame ? &cm->prev_frame->global_motion[frame]
                      : &default_warp_params;
 
-  compute_global_motion_for_ref_frame(cpi, ref_buf, frame, motion_models,
-                                      segment_map, segment_map_w, segment_map_h,
-                                      ref_params);
+  compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame,
+                                      motion_models, segment_map, segment_map_w,
+                                      segment_map_h, ref_params);
 }
 
 // Loops over valid reference frames and computes global motion estimation.
@@ -223,13 +214,15 @@
     MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
     const int segment_map_h) {
   AV1_COMMON *const cm = &cpi->common;
+  struct aom_internal_error_info *const error_info =
+      cpi->td.mb.e_mbd.error_info;
   // Compute global motion w.r.t. reference frames starting from the nearest ref
   // frame in a given direction.
   for (int frame = 0; frame < num_ref_frames; frame++) {
     int ref_frame = reference_frame[frame].frame;
-    av1_compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, motion_models,
-                                        segment_map, segment_map_w,
-                                        segment_map_h);
+    av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame,
+                                        motion_models, segment_map,
+                                        segment_map_w, segment_map_h);
     // If global motion w.r.t. current ref frame is
     // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
     // the remaining ref frames in that direction.
@@ -361,40 +354,6 @@
   }
 }
 
-// Deallocates segment_map and inliers.
-static AOM_INLINE void dealloc_global_motion_data(MotionModel *motion_models,
-                                                  uint8_t *segment_map) {
-  aom_free(segment_map);
-
-  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-    aom_free(motion_models[m].inliers);
-  }
-}
-
-// Allocates and initializes memory for segment_map and MotionModel.
-static AOM_INLINE bool alloc_global_motion_data(MotionModel *motion_models,
-                                                uint8_t **segment_map,
-                                                const int segment_map_w,
-                                                const int segment_map_h) {
-  av1_zero_array(motion_models, RANSAC_NUM_MOTIONS);
-  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-    motion_models[m].inliers =
-        aom_malloc(sizeof(*(motion_models[m].inliers)) * 2 * MAX_CORNERS);
-    if (!motion_models[m].inliers) {
-      dealloc_global_motion_data(motion_models, NULL);
-      return false;
-    }
-  }
-
-  *segment_map = (uint8_t *)aom_calloc(segment_map_w * segment_map_h,
-                                       sizeof(*segment_map));
-  if (!*segment_map) {
-    dealloc_global_motion_data(motion_models, NULL);
-    return false;
-  }
-  return true;
-}
-
 // Initializes parameters used for computing global motion.
 static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
   GlobalMotionInfo *const gm_info = &cpi->gm_info;
@@ -439,11 +398,7 @@
 // Computes global motion w.r.t. valid reference frames.
 static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
   GlobalMotionInfo *const gm_info = &cpi->gm_info;
-  MotionModel motion_models[RANSAC_NUM_MOTIONS];
-  uint8_t *segment_map = NULL;
-
-  alloc_global_motion_data(motion_models, &segment_map, gm_info->segment_map_w,
-                           gm_info->segment_map_h);
+  GlobalMotionData *gm_data = &cpi->td.gm_data;
 
   // Compute global motion w.r.t. past reference frames and future reference
   // frames
@@ -451,11 +406,9 @@
     if (gm_info->num_ref_frames[dir] > 0)
       compute_global_motion_for_references(
           cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
-          gm_info->num_ref_frames[dir], motion_models, segment_map,
-          gm_info->segment_map_w, gm_info->segment_map_h);
+          gm_info->num_ref_frames[dir], gm_data->motion_models,
+          gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
   }
-
-  dealloc_global_motion_data(motion_models, segment_map);
 }
 
 // Global motion estimation for the current frame is computed. This computation
@@ -478,13 +431,19 @@
   }
 
   if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
-      cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done) {
+      cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done &&
+      cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) {
     setup_global_motion_info_params(cpi);
-    if (cpi->mt_info.num_workers > 1)
-      av1_global_motion_estimation_mt(cpi);
-    else
-      global_motion_estimation(cpi);
-    gm_info->search_done = 1;
+    // Terminate early if the total number of reference frames is zero.
+    if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) {
+      gm_alloc_data(cpi, &cpi->td.gm_data);
+      if (cpi->mt_info.num_workers > 1)
+        av1_global_motion_estimation_mt(cpi);
+      else
+        global_motion_estimation(cpi);
+      gm_dealloc_data(&cpi->td.gm_data);
+      gm_info->search_done = 1;
+    }
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
          sizeof(cm->cur_frame->global_motion));
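
The rewritten loop above keeps a single running ratio, best_erroradv =
warp_error / ref_frame_error, and applies the threshold test once after the
loop instead of per motion index. A minimal standalone sketch of that
selection rule, with libaom internals reduced to plain doubles
(is_enough_erroradvantage_sketch is a hypothetical stand-in for
av1_is_enough_erroradvantage, which additionally weighs the parameter
signalling cost):

    #include <stdio.h>

    /* Hypothetical stand-in for the threshold test: a warp model is worth
     * keeping only if it shrinks the frame error by a large enough factor. */
    static int is_enough_erroradvantage_sketch(double erroradv, double thresh) {
      return erroradv < thresh;
    }

    int main(void) {
      /* (warp_error, ref_frame_error) for a few candidate motion models. */
      const double warp_error[3] = { 900.0, 400.0, 650.0 };
      const double ref_frame_error[3] = { 1000.0, 1000.0, 1000.0 };
      double best_erroradv = 1e30;  /* "worse than anything" initial value */
      int best_model = -1;

      for (int m = 0; m < 3; m++) {
        if (ref_frame_error[m] == 0) continue;  /* skip degenerate models */
        const double erroradvantage = warp_error[m] / ref_frame_error[m];
        if (erroradvantage < best_erroradv) {
          best_erroradv = erroradvantage;
          best_model = m;
        }
      }

      /* Revert to IDENTITY (no global motion) when the best ratio does not
       * clear the bar; 0.5 is an illustrative threshold, not libaom's. */
      if (best_model < 0 ||
          !is_enough_erroradvantage_sketch(best_erroradv, 0.5))
        printf("keep IDENTITY\n");
      else
        printf("use model %d (erroradv = %.2f)\n", best_model, best_erroradv);
      return 0;
    }
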
diff --git a/av1/encoder/global_motion_facade.h b/av1/encoder/global_motion_facade.h
index dfdedf7..f13989aa 100644
--- a/av1/encoder/global_motion_facade.h
+++ b/av1/encoder/global_motion_facade.h
@@ -18,8 +18,36 @@
 struct yv12_buffer_config;
 struct AV1_COMP;
 
+// Allocates memory for members of GlobalMotionData.
+static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) {
+  AV1_COMMON *cm = &cpi->common;
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+  CHECK_MEM_ERROR(cm, gm_data->segment_map,
+                  aom_malloc(sizeof(*gm_data->segment_map) *
+                             gm_info->segment_map_w * gm_info->segment_map_h));
+
+  av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS);
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers,
+                    aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 *
+                               MAX_CORNERS));
+  }
+}
+
+// Deallocates the memory allocated for members of GlobalMotionData.
+static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) {
+  aom_free(gm_data->segment_map);
+  gm_data->segment_map = NULL;
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    aom_free(gm_data->motion_models[m].inliers);
+    gm_data->motion_models[m].inliers = NULL;
+  }
+}
+
 void av1_compute_gm_for_valid_ref_frames(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
     MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
     int segment_map_h);
 void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
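
With the helpers above, the per-frame buffers live in cpi->td.gm_data and are
allocated once around the whole search (see the gm_alloc_data()/
gm_dealloc_data() bracket added in global_motion_facade.c). A reduced sketch
of that alloc/search/free pairing, with CHECK_MEM_ERROR replaced by a boolean
return and the libaom structs shrunk to local stand-ins:

    #include <stdlib.h>
    #include <string.h>

    #define NUM_MOTIONS_SKETCH 4     /* stands in for RANSAC_NUM_MOTIONS */
    #define MAX_CORNERS_SKETCH 4096  /* stands in for MAX_CORNERS */

    typedef struct { int *inliers; } MotionModelSketch;
    typedef struct {
      MotionModelSketch motion_models[NUM_MOTIONS_SKETCH];
      unsigned char *segment_map;
    } GmDataSketch;

    static int gm_alloc_sketch(GmDataSketch *d, int seg_w, int seg_h) {
      memset(d, 0, sizeof(*d));
      d->segment_map = malloc((size_t)seg_w * seg_h);
      if (!d->segment_map) return 0;
      for (int m = 0; m < NUM_MOTIONS_SKETCH; m++) {
        d->motion_models[m].inliers = malloc(
            sizeof(*d->motion_models[m].inliers) * 2 * MAX_CORNERS_SKETCH);
        if (!d->motion_models[m].inliers) return 0;
      }
      return 1;
    }

    static void gm_dealloc_sketch(GmDataSketch *d) {
      free(d->segment_map);
      d->segment_map = NULL;
      for (int m = 0; m < NUM_MOTIONS_SKETCH; m++) {
        free(d->motion_models[m].inliers);
        d->motion_models[m].inliers = NULL;
      }
    }

    int main(void) {
      GmDataSketch d;
      if (gm_alloc_sketch(&d, 64, 36)) {
        /* ... run global motion estimation for every valid reference ... */
      }
      gm_dealloc_sketch(&d);  /* safe even after a partial allocation */
      return 0;
    }
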
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 3091037..8037b59 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -10,6 +10,7 @@
  */
 
 #include "av1/encoder/hash.h"
+#include "config/av1_rtcd.h"
 
 static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
                                         uint8_t *pData, uint32_t dataLength) {
diff --git a/av1/encoder/hash_motion.c b/av1/encoder/hash_motion.c
index 164aa09..8b04e22 100644
--- a/av1/encoder/hash_motion.c
+++ b/av1/encoder/hash_motion.c
@@ -128,7 +128,7 @@
   }
   p_hash_table->p_lookup_table =
       (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
-  if (!p_hash_table) return false;
+  if (!p_hash_table->p_lookup_table) return false;
   return true;
 }
 
@@ -141,13 +141,16 @@
     if (p_hash_table->p_lookup_table[hash_value] == NULL) {
       return false;
     }
-    aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
-                     sizeof(curr_block_hash[0]));
-    aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
-                         curr_block_hash);
+    if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+                         sizeof(curr_block_hash[0])) == VECTOR_ERROR)
+      return false;
+    if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+                             curr_block_hash) == VECTOR_ERROR)
+      return false;
   } else {
-    aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
-                         curr_block_hash);
+    if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+                             curr_block_hash) == VECTOR_ERROR)
+      return false;
   }
   return true;
 }
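
Two robustness fixes above: the aom_calloc() result is now tested on the
freshly allocated p_lookup_table member rather than on the (never NULL) table
pointer, and VECTOR_ERROR returns from aom_vector_setup()/
aom_vector_push_back() are propagated instead of ignored. A reduced sketch of
the corrected pattern (hash_table_sketch and vec_push_sketch are illustrative
stand-ins, not libaom API):

    #include <stdbool.h>
    #include <stdlib.h>

    typedef struct { void **p_lookup_table; } hash_table_sketch;

    /* Stand-in for a vector push that can fail. */
    static bool vec_push_sketch(void **slot, void *item) {
      (void)slot;
      (void)item;
      return true;
    }

    static bool hash_table_create_sketch(hash_table_sketch *t,
                                         size_t max_addr) {
      t->p_lookup_table = calloc(max_addr, sizeof(t->p_lookup_table[0]));
      /* Test the member that was just allocated; testing `t` itself would
       * never fire and would let the allocation failure go unnoticed. */
      return t->p_lookup_table != NULL;
    }

    static bool hash_table_add_sketch(hash_table_sketch *t, size_t idx,
                                      void *block_hash) {
      /* Bubble the failure up instead of silently dropping the block hash. */
      return vec_push_sketch(&t->p_lookup_table[idx], block_hash);
    }

    int main(void) {
      hash_table_sketch t;
      if (!hash_table_create_sketch(&t, 1u << 16)) return 1;
      const bool ok = hash_table_add_sketch(&t, 42, NULL);
      free(t.p_lookup_table);
      return ok ? 0 : 1;
    }
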
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 4c2f8d0..a108e81 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -312,17 +312,51 @@
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+                                       ptrdiff_t src_stride,
+                                       tran_low_t *coeff) {
+  switch (tx_size) {
+    // As the output transform coefficients of the 4x4 Hadamard transform can
+    // be represented using 15 bits (for a 12-bit clip), use the lowbd variant
+    // of hadamard_4x4.
+    case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+    case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break;
+    case TX_16X16:
+      aom_highbd_hadamard_16x16(src_diff, src_stride, coeff);
+      break;
+    case TX_32X32:
+      aom_highbd_hadamard_32x32(src_diff, src_stride, coeff);
+      break;
+    default: assert(0);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+                                ptrdiff_t src_stride, tran_low_t *coeff) {
+  switch (tx_size) {
+    case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+    case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+    case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+    case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+    default: assert(0);
+  }
+}
+
 void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
                     const int16_t *src_diff, int src_stride,
                     tran_low_t *coeff) {
   if (use_hadamard) {
-    switch (tx_size) {
-      case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
-      case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
-      case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
-      case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
-      default: assert(0);
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (bd_info.use_highbitdepth_buf) {
+      highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+    } else {
+      wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
     }
+#else
+    wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
   } else {
     TxfmParam txfm_param;
     txfm_param.tx_type = DCT_DCT;
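
The TX_4X4 case in highbd_wht_fwd_txfm() deliberately stays on the lowbd
aom_hadamard_4x4 even for high-bit-depth buffers. A back-of-the-envelope
bound for why that is safe, assuming (as in the lowbd C path) that each
4-point pass halves its first butterfly stage and therefore grows magnitude
by at most one bit:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* 4095: the |src - pred| bound for a 12-bit clip. */
      const int max_residual = (1 << 12) - 1;
      /* Two passes (rows, then columns), each adding at most one bit. */
      const int max_coeff = max_residual << 2;  /* 16380 */
      assert(max_coeff < (1 << 14));   /* 14 bits of magnitude + sign = 15 bits */
      assert(max_coeff <= INT16_MAX);  /* fits the lowbd int16_t intermediates */
      printf("|coeff| <= %d\n", max_coeff);
      return 0;
    }

The larger block sizes grow further with each extra butterfly stage, which is
why TX_8X8 and above keep their aom_highbd_hadamard_* variants.
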
diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c
index 247fa3e..2723530 100644
--- a/av1/encoder/interp_search.c
+++ b/av1/encoder/interp_search.c
@@ -662,8 +662,7 @@
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int need_search =
-      av1_is_interp_needed(xd) && !cpi->sf.rt_sf.skip_interp_filter_search;
+  const int need_search = av1_is_interp_needed(xd);
   const int ref_frame = xd->mi[0]->ref_frame[0];
   RD_STATS rd_stats_luma, rd_stats;
 
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index bce494e..9815e0b 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -109,6 +109,11 @@
    */
   int skip_motion_mode;
   /*!
+   * Initialized to false. If true, skips interpolation filter search and uses
+   * the default EIGHTTAP_REGULAR.
+   */
+  bool skip_ifs;
+  /*!
    * A pointer to the first element in an array of INTERINTRA_MODE types. This
    * contains the best inter_intra mode for each reference frame.
    */
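
Together with the interp_search.c hunk above (which drops the
skip_interp_filter_search check from need_search), the decision to bypass the
filter search is now carried by this precomputed skip_ifs flag. A
hypothetical illustration of how such a flag is typically consumed; the types
and helper names below are assumptions, not libaom code:

    #include <stdbool.h>

    typedef enum { FILTER_REGULAR_SKETCH, FILTER_SMOOTH_SKETCH } FilterSketch;

    typedef struct {
      bool skip_ifs;  /* mirrors the skip_ifs field documented above */
      FilterSketch filters;
    } InterArgsSketch;

    static FilterSketch search_filters_sketch(void) {
      /* ... per-filter RD evaluation would go here ... */
      return FILTER_SMOOTH_SKETCH;
    }

    static void pick_filter_sketch(InterArgsSketch *args, bool interp_needed) {
      if (!interp_needed || args->skip_ifs) {
        /* Skip the search and keep the default (EIGHTTAP_REGULAR in libaom). */
        args->filters = FILTER_REGULAR_SKETCH;
        return;
      }
      args->filters = search_filters_sketch();
    }

    int main(void) {
      InterArgsSketch args = { .skip_ifs = true,
                               .filters = FILTER_REGULAR_SKETCH };
      pick_filter_sketch(&args, /*interp_needed=*/true);
      return args.filters == FILTER_REGULAR_SKETCH ? 0 : 1;
    }
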
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 3b5dd75..99b0af2 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -874,16 +874,17 @@
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
     RD_STATS tokenonly_rd_stats;
-    UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+    UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx];
 
     // Skip the current mode evaluation if the RD cost derived using the mode
     // signaling rate exceeds the best_rd so far.
     const int mode_rate =
-        mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+        mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
     if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue;
 
-    const int is_diagonal_mode = av1_is_diagonal_mode(get_uv_mode(mode));
-    const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+    PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+    const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode);
+    const int is_directional_mode = av1_is_directional_mode(intra_mode);
 
     if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
       continue;
@@ -892,25 +893,26 @@
       continue;
 
     if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
-          (1 << mode)))
+          (1 << uv_mode)))
       continue;
-    if (!intra_mode_cfg->enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
-        mode <= UV_SMOOTH_H_PRED)
+    if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED &&
+        uv_mode <= UV_SMOOTH_H_PRED)
       continue;
 
-    if (!intra_mode_cfg->enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+    if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED)
+      continue;
 
     assert(mbmi->mode < INTRA_MODES);
     if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
-        !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << mode)))
+        !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)))
       continue;
 
-    mbmi->uv_mode = mode;
+    mbmi->uv_mode = uv_mode;
 
     // Init variables for cfl and angle delta
     const SPEED_FEATURES *sf = &cpi->sf;
     mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-    if (mode == UV_CFL_PRED) {
+    if (uv_mode == UV_CFL_PRED) {
       if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue;
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
@@ -936,18 +938,18 @@
             intra_search_state.directional_mode_skip_mask, is_chroma);
         intra_search_state.dir_mode_skip_mask_ready = 1;
       }
-      if (intra_search_state.directional_mode_skip_mask[mode]) {
+      if (intra_search_state.directional_mode_skip_mask[uv_mode]) {
         continue;
       }
 
       // Search through angle delta
       const int rate_overhead =
-          mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+          mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
         continue;
     } else {
-      if (mode == UV_SMOOTH_PRED &&
+      if (uv_mode == UV_SMOOTH_PRED &&
           should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x,
                                                                    bsize))
         continue;
@@ -958,7 +960,7 @@
       }
     }
     const int mode_cost =
-        mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][mode];
+        mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
     this_rate = tokenonly_rd_stats.rate +
                 intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
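
The chroma loop above prunes a mode as soon as the cost of merely signalling
it exceeds the best RD total found so far (RDCOST with zero distortion). A
minimal sketch of that early-out using a generic lambda * rate + distortion
cost; libaom's RDCOST macro does the same comparison in fixed point:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified RD cost: lambda converts bits into distortion units. */
    static int64_t rd_cost_sketch(int64_t lambda, int64_t rate, int64_t dist) {
      return lambda * rate + dist;
    }

    int main(void) {
      const int64_t lambda = 128;
      const int64_t best_rd = 500000;  /* best full cost so far */
      const int64_t mode_rate[3] = { 100, 9000, 300 };

      for (int m = 0; m < 3; m++) {
        /* If the signalling rate alone already exceeds best_rd, the full
         * evaluation (prediction, transform, distortion) cannot win. */
        if (rd_cost_sketch(lambda, mode_rate[m], 0) > best_rd) {
          printf("mode %d pruned by rate alone\n", m);
          continue;
        }
        printf("mode %d fully evaluated\n", m);
      }
      return 0;
    }
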
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 4519e46..107c223 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -576,13 +576,13 @@
   int total_rate = mode_cost;
   const ModeCosts *mode_costs = &x->mode_costs;
   const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
-  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
   // Can only activate one mode.
-  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+  assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
 
   const int try_palette = av1_allow_palette(
       cpi->common.features.allow_screen_content_tools, mbmi->bsize);
-  if (try_palette && mode == UV_DC_PRED) {
+  if (try_palette && uv_mode == UV_DC_PRED) {
     const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
     total_rate +=
         mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
@@ -604,10 +604,11 @@
       total_rate += palette_mode_cost;
     }
   }
-  if (av1_is_directional_mode(get_uv_mode(mode))) {
+  const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+  if (av1_is_directional_mode(intra_mode)) {
     if (av1_use_angle_delta(bsize)) {
       total_rate +=
-          mode_costs->angle_delta_cost[mode - V_PRED]
+          mode_costs->angle_delta_cost[intra_mode - V_PRED]
                                       [mbmi->angle_delta[PLANE_TYPE_UV] +
                                        MAX_ANGLE_DELTA];
     }
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cc39c81..a8b0d10 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -61,30 +61,6 @@
   ms_buffers->obmc_mask = x->obmc_buffer.mask;
 }
 
-static AOM_INLINE SEARCH_METHODS
-get_faster_search_method(SEARCH_METHODS search_method) {
-  // Note on search method's accuracy:
-  //  1. NSTEP
-  //  2. DIAMOND
-  //  3. BIGDIA \approx SQUARE
-  //  4. HEX.
-  //  5. FAST_HEX \approx FAST_DIAMOND
-  switch (search_method) {
-    case NSTEP: return DIAMOND;
-    case NSTEP_8PT: return DIAMOND;
-    case DIAMOND: return BIGDIA;
-    case CLAMPED_DIAMOND: return BIGDIA;
-    case BIGDIA: return HEX;
-    case SQUARE: return HEX;
-    case HEX: return FAST_HEX;
-    case FAST_HEX: return FAST_HEX;
-    case FAST_DIAMOND: return VFAST_DIAMOND;
-    case FAST_BIGDIA: return FAST_BIGDIA;
-    case VFAST_DIAMOND: return VFAST_DIAMOND;
-    default: assert(0 && "Invalid search method!"); return DIAMOND;
-  }
-}
-
 void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
   obmc_buffer->wsrc = NULL;
   obmc_buffer->mask = NULL;
@@ -96,7 +72,7 @@
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
     const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
-    int fine_search_interval) {
+    SEARCH_METHODS search_method, int fine_search_interval) {
   const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
   const int is_key_frame =
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
@@ -107,28 +83,6 @@
 
   init_ms_buffers(&ms_params->ms_buffers, x);
 
-  SEARCH_METHODS search_method = mv_sf->search_method;
-  const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
-  const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
-  const int qband = x->qindex >> (QINDEX_BITS - 2);
-  const bool use_faster_search_method =
-      (sf_blk_search_method == 1 && min_dim >= 32) ||
-      (sf_blk_search_method >= 2 && min_dim >= 16 &&
-       x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
-
-  if (use_faster_search_method) {
-    search_method = get_faster_search_method(search_method);
-
-    // We might need to update the search site config since search_method
-    // is changed here.
-    const int ref_stride = ms_params->ms_buffers.ref->stride;
-    if (ref_stride != search_sites[search_method].stride) {
-      av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
-                                     ref_stride);
-      search_sites = x->search_site_cfg_buf;
-    }
-  }
-
   av1_set_mv_search_method(ms_params, search_sites, search_method);
 
   ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
@@ -700,7 +654,8 @@
 }
 
 static INLINE int get_mvpred_var_cost(
-    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+    FULLPEL_MV_STATS *mv_stats) {
   const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const MV sub_this_mv = get_mv_from_fullmv(this_mv);
   const struct buf_2d *const src = ms_params->ms_buffers.src;
@@ -709,13 +664,14 @@
   const int src_stride = src->stride;
   const int ref_stride = ref->stride;
 
-  unsigned unused;
   int bestsme;
 
   bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
-                    ref_stride, &unused);
+                    ref_stride, &mv_stats->sse);
+  mv_stats->distortion = bestsme;
 
-  bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+  mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+  bestsme += mv_stats->err_cost;
 
   return bestsme;
 }
@@ -731,7 +687,8 @@
 }
 
 static INLINE int get_mvpred_compound_var_cost(
-    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+    FULLPEL_MV_STATS *mv_stats) {
   const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -743,23 +700,24 @@
   const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
   const int mask_stride = ms_params->ms_buffers.mask_stride;
   const int invert_mask = ms_params->ms_buffers.inv_mask;
-  unsigned unused;
   int bestsme;
 
   if (mask) {
     bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
                         src_buf, src_stride, second_pred, mask, mask_stride,
-                        invert_mask, &unused);
+                        invert_mask, &mv_stats->sse);
   } else if (second_pred) {
     bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
-                        src_buf, src_stride, &unused, second_pred);
+                        src_buf, src_stride, &mv_stats->sse, second_pred);
   } else {
     bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
-                      ref_stride, &unused);
+                      ref_stride, &mv_stats->sse);
   }
+  mv_stats->distortion = bestsme;
 
   const MV sub_this_mv = get_mv_from_fullmv(this_mv);
-  bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+  mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+  bestsme += mv_stats->err_cost;
 
   return bestsme;
 }
@@ -803,13 +761,15 @@
   const int br = best_mv.row;
   const int bc = best_mv.col;
 
-  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv);
+  FULLPEL_MV_STATS mv_stats;
+  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
 
   if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
     for (int i = 0; i < 4; i++) {
       const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
                                        bc + neighbors[i].col };
-      cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+      cost_list[i + 1] =
+          get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
     }
   } else {
     for (int i = 0; i < 4; i++) {
@@ -818,7 +778,8 @@
       if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
         cost_list[i + 1] = INT_MAX;
       } else {
-        cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+        cost_list[i + 1] =
+            get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
       }
     }
   }
@@ -1055,7 +1016,8 @@
 static int pattern_search(FULLPEL_MV start_mv,
                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                           int search_step, const int do_init_search,
-                          int *cost_list, FULLPEL_MV *best_mv) {
+                          int *cost_list, FULLPEL_MV *best_mv,
+                          FULLPEL_MV_STATS *best_mv_stats) {
   static const int search_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
@@ -1278,7 +1240,7 @@
     }
   }
 
-  const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
+  const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
   return var_cost;
 }
 
@@ -1296,61 +1258,68 @@
 static int hex_search(const FULLPEL_MV start_mv,
                       const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                       const int search_step, const int do_init_search,
-                      int *cost_list, FULLPEL_MV *best_mv) {
+                      int *cost_list, FULLPEL_MV *best_mv,
+                      FULLPEL_MV_STATS *best_mv_stats) {
   return pattern_search(start_mv, ms_params, search_step, do_init_search,
-                        cost_list, best_mv);
+                        cost_list, best_mv, best_mv_stats);
 }
 
 static int bigdia_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                          const int search_step, const int do_init_search,
-                         int *cost_list, FULLPEL_MV *best_mv) {
+                         int *cost_list, FULLPEL_MV *best_mv,
+                         FULLPEL_MV_STATS *best_mv_stats) {
   return pattern_search(start_mv, ms_params, search_step, do_init_search,
-                        cost_list, best_mv);
+                        cost_list, best_mv, best_mv_stats);
 }
 
 static int square_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                          const int search_step, const int do_init_search,
-                         int *cost_list, FULLPEL_MV *best_mv) {
+                         int *cost_list, FULLPEL_MV *best_mv,
+                         FULLPEL_MV_STATS *best_mv_stats) {
   return pattern_search(start_mv, ms_params, search_step, do_init_search,
-                        cost_list, best_mv);
+                        cost_list, best_mv, best_mv_stats);
 }
 
 static int fast_hex_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                            const int search_step, const int do_init_search,
-                           int *cost_list, FULLPEL_MV *best_mv) {
+                           int *cost_list, FULLPEL_MV *best_mv,
+                           FULLPEL_MV_STATS *best_mv_stats) {
   return hex_search(start_mv, ms_params,
                     AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
-                    cost_list, best_mv);
+                    cost_list, best_mv, best_mv_stats);
 }
 
 static int vfast_dia_search(const FULLPEL_MV start_mv,
                             const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                             const int search_step, const int do_init_search,
-                            int *cost_list, FULLPEL_MV *best_mv) {
+                            int *cost_list, FULLPEL_MV *best_mv,
+                            FULLPEL_MV_STATS *best_mv_stats) {
   return bigdia_search(start_mv, ms_params,
                        AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
-                       do_init_search, cost_list, best_mv);
+                       do_init_search, cost_list, best_mv, best_mv_stats);
 }
 
 static int fast_dia_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                            const int search_step, const int do_init_search,
-                           int *cost_list, FULLPEL_MV *best_mv) {
+                           int *cost_list, FULLPEL_MV *best_mv,
+                           FULLPEL_MV_STATS *best_mv_stats) {
   return bigdia_search(start_mv, ms_params,
                        AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
-                       do_init_search, cost_list, best_mv);
+                       do_init_search, cost_list, best_mv, best_mv_stats);
 }
 
 static int fast_bigdia_search(const FULLPEL_MV start_mv,
                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                               const int search_step, const int do_init_search,
-                              int *cost_list, FULLPEL_MV *best_mv) {
+                              int *cost_list, FULLPEL_MV *best_mv,
+                              FULLPEL_MV_STATS *best_mv_stats) {
   return bigdia_search(start_mv, ms_params,
                        AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
-                       do_init_search, cost_list, best_mv);
+                       do_init_search, cost_list, best_mv, best_mv_stats);
 }
 
 static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
@@ -1528,7 +1497,9 @@
 static int full_pixel_diamond(FULLPEL_MV start_mv,
                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                               const int step_param, int *cost_list,
-                              FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+                              FULLPEL_MV *best_mv,
+                              FULLPEL_MV_STATS *best_mv_stats,
+                              FULLPEL_MV *second_best_mv) {
   const search_site_config *cfg = ms_params->search_sites;
   int thissme, n, num00 = 0;
 
@@ -1539,7 +1510,7 @@
   diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
                      second_best_mv);
 
-  int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv);
+  int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
@@ -1550,14 +1521,17 @@
     // TODO(chiyotsai@google.com): There is another bug here where the second
     // best mv gets incorrectly overwritten. Fix it later.
     FULLPEL_MV tmp_best_mv;
+    FULLPEL_MV_STATS tmp_best_mv_stats;
     diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
                        &num00, &tmp_best_mv, second_best_mv);
 
-    thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv);
+    thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
+                                           &tmp_best_mv_stats);
 
     if (thissme < bestsme) {
       bestsme = thissme;
       *best_mv = tmp_best_mv;
+      *best_mv_stats = tmp_best_mv_stats;
     }
 
     if (num00) {
@@ -1658,6 +1632,7 @@
                                  const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                                  const struct MESH_PATTERN *const mesh_patterns,
                                  int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV_STATS *mv_stats,
                                  FULLPEL_MV *second_best_mv) {
   const int kMinRange = 7;
   const int kMaxRange = 256;
@@ -1717,7 +1692,7 @@
   }
 
   if (bestsme < INT_MAX) {
-    bestsme = get_mvpred_var_cost(ms_params, best_mv);
+    bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
   }
 
   // Return cost list.
@@ -1809,7 +1784,8 @@
 int av1_full_pixel_search(const FULLPEL_MV start_mv,
                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                           const int step_param, int *cost_list,
-                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+                          FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+                          FULLPEL_MV *second_best_mv) {
   const BLOCK_SIZE bsize = ms_params->bsize;
   const SEARCH_METHODS search_method = ms_params->search_method;
 
@@ -1831,41 +1807,43 @@
   }
 
   assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
+  assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width);
 
   switch (search_method) {
     case FAST_BIGDIA:
       var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
-                               best_mv);
+                               best_mv, best_mv_stats);
       break;
     case VFAST_DIAMOND:
       var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
-                             best_mv);
+                             best_mv, best_mv_stats);
       break;
     case FAST_DIAMOND:
       var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
-                            best_mv);
+                            best_mv, best_mv_stats);
       break;
     case FAST_HEX:
       var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
-                            best_mv);
+                            best_mv, best_mv_stats);
       break;
     case HEX:
-      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
+                       best_mv_stats);
       break;
     case SQUARE:
-      var =
-          square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      var = square_search(start_mv, ms_params, step_param, 1, cost_list,
+                          best_mv, best_mv_stats);
       break;
     case BIGDIA:
-      var =
-          bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
+                          best_mv, best_mv_stats);
       break;
     case NSTEP:
     case NSTEP_8PT:
     case DIAMOND:
     case CLAMPED_DIAMOND:
       var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
-                               best_mv, second_best_mv);
+                               best_mv, best_mv_stats, second_best_mv);
       break;
     default: assert(0 && "Invalid search method.");
   }
@@ -1922,13 +1900,15 @@
       new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
 
       return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
-                                   cost_list, best_mv, second_best_mv);
+                                   cost_list, best_mv, best_mv_stats,
+                                   second_best_mv);
     }
   }
 
   if (run_mesh_search) {
     int var_ex;
     FULLPEL_MV tmp_mv_ex;
+    FULLPEL_MV_STATS tmp_mv_stats;
     // Pick the mesh pattern for exhaustive search based on the toolset (intraBC
     // or non-intraBC)
     // TODO(chiyotsai@google.com):  There is a bug here where the second best mv
@@ -1937,10 +1917,12 @@
         ms_params->mesh_patterns[is_intra_mode];
     // TODO(chiyotsai@google.com): the second best mv is not set correctly by
     // full_pixel_exhaustive, which can incorrectly override it.
-    var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns,
-                                   cost_list, &tmp_mv_ex, second_best_mv);
+    var_ex =
+        full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
+                              &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
     if (var_ex < var) {
       var = var_ex;
+      *best_mv_stats = tmp_mv_stats;
       *best_mv = tmp_mv_ex;
     }
   }
@@ -2000,7 +1982,8 @@
       hash_mv.col = ref_block_hash.x - x_pos;
       hash_mv.row = ref_block_hash.y - y_pos;
       if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
-      const int refCost = get_mvpred_var_cost(ms_params, &hash_mv);
+      FULLPEL_MV_STATS mv_stats;
+      const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats);
       if (refCost < best_hash_cost) {
         best_hash_cost = refCost;
         *best_mv = hash_mv;
@@ -2011,12 +1994,27 @@
   return best_hash_cost;
 }
 
-static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
+                        int full_search, int *sad) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
   int center, offset = 0;
-  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  int bw = search_size << 1;
+
+  if (full_search) {
+    for (d = 0; d <= bw; d++) {
+      this_sad = aom_vector_var(&ref[d], src, bwl);
+      if (this_sad < best_sad) {
+        best_sad = this_sad;
+        offset = d;
+      }
+    }
+    center = offset;
+    *sad = best_sad;
+    return (center - (bw >> 1));
+  }
+
   for (d = 0; d <= bw; d += 16) {
     this_sad = aom_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
@@ -2072,31 +2070,47 @@
       center = this_pos;
     }
   }
-
+  *sad = best_sad;
   return (center - (bw >> 1));
 }
 
-// A special fast version of motion search used in rt mode
+// A special fast version of motion search used in rt mode.
+// The search window along columns and rows is given by:
+//  +/- me_search_size_col/row.
 unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
                                            BLOCK_SIZE bsize, int mi_row,
-                                           int mi_col, const MV *ref_mv) {
+                                           int mi_col, const MV *ref_mv,
+                                           unsigned int *y_sad_zero,
+                                           int me_search_size_col,
+                                           int me_search_size_row) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  DECLARE_ALIGNED(16, int16_t, hbuf[256]);
-  DECLARE_ALIGNED(16, int16_t, vbuf[256]);
-  DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
   int idx;
-  const int bw = 4 << mi_size_wide_log2[bsize];
-  const int bh = 4 << mi_size_high_log2[bsize];
-  const int search_width = bw << 1;
-  const int search_height = bh << 1;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  const int full_search = is_screen;
+  const bool screen_scroll_superblock =
+      is_screen && bsize == cm->seq_params->sb_size;
+  // Keep border a multiple of 16.
+  const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+  int search_size_width = me_search_size_col;
+  int search_size_height = me_search_size_row;
+  // Adjust based on boundary.
+  if (((mi_col << 2) - search_size_width < -border) ||
+      ((mi_col << 2) + search_size_width > cm->width + border))
+    search_size_width = border;
+  if (((mi_row << 2) - search_size_height < -border) ||
+      ((mi_row << 2) + search_size_height > cm->height + border))
+    search_size_height = border;
   const int src_stride = x->plane[0].src.stride;
   const int ref_stride = xd->plane[0].pre[0].stride;
   uint8_t const *ref_buf, *src_buf;
   int_mv *best_int_mv = &xd->mi[0]->mv[0];
   unsigned int best_sad, tmp_sad, this_sad[4];
+  int best_sad_col, best_sad_row;
   const int row_norm_factor = mi_size_high_log2[bsize] + 1;
   const int col_norm_factor = 3 + (bw >> 5);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
@@ -2129,13 +2143,29 @@
     }
     return best_sad;
   }
+  const int width_ref_buf = (search_size_width << 1) + bw;
+  const int height_ref_buf = (search_size_height << 1) + bh;
+  int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
+  int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
+  int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+  int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+  if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+    aom_free(hbuf);
+    aom_free(vbuf);
+    aom_free(src_hbuf);
+    aom_free(src_vbuf);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+  }
 
-  // Set up prediction 1-D reference set
-  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
-  aom_int_pro_row(hbuf, ref_buf, ref_stride, search_width, bh, row_norm_factor);
+  // Set up prediction 1-D reference set for rows.
+  ref_buf = xd->plane[0].pre[0].buf - search_size_width;
+  aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
+                  row_norm_factor);
 
-  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
-  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, search_height,
+  // Set up prediction 1-D reference set for cols.
+  ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
+  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
                   col_norm_factor);
 
   // Set up src 1-D reference set
@@ -2145,9 +2175,19 @@
 
   // Find the best match per 1-D search
   best_int_mv->as_fullmv.col =
-      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
+                   full_search, &best_sad_col);
   best_int_mv->as_fullmv.row =
-      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
+                   full_search, &best_sad_row);
+
+  // For screen: select between horiz or vert motion.
+  if (is_screen) {
+    if (best_sad_col < best_sad_row)
+      best_int_mv->as_fullmv.row = 0;
+    else
+      best_int_mv->as_fullmv.col = 0;
+  }
 
   FULLPEL_MV this_mv = best_int_mv->as_fullmv;
   src_buf = x->plane[0].src.buf;
@@ -2159,16 +2199,18 @@
   if (best_int_mv->as_int != 0) {
     tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
                                           xd->plane[0].pre[0].buf, ref_stride);
-
+    *y_sad_zero = tmp_sad;
     if (tmp_sad < best_sad) {
       best_int_mv->as_fullmv = kZeroFullMv;
       this_mv = best_int_mv->as_fullmv;
       ref_buf = xd->plane[0].pre[0].buf;
       best_sad = tmp_sad;
     }
+  } else {
+    *y_sad_zero = best_sad;
   }
 
-  {
+  if (!screen_scroll_superblock) {
     const uint8_t *const pos[4] = {
       ref_buf - ref_stride,
       ref_buf - 1,
@@ -2178,33 +2220,33 @@
 
     cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
                                    this_sad);
-  }
 
-  for (idx = 0; idx < 4; ++idx) {
-    if (this_sad[idx] < best_sad) {
-      best_sad = this_sad[idx];
-      best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
-      best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+    for (idx = 0; idx < 4; ++idx) {
+      if (this_sad[idx] < best_sad) {
+        best_sad = this_sad[idx];
+        best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+        best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+      }
     }
-  }
 
-  if (this_sad[0] < this_sad[3])
-    this_mv.row -= 1;
-  else
-    this_mv.row += 1;
+    if (this_sad[0] < this_sad[3])
+      this_mv.row -= 1;
+    else
+      this_mv.row += 1;
 
-  if (this_sad[1] < this_sad[2])
-    this_mv.col -= 1;
-  else
-    this_mv.col += 1;
+    if (this_sad[1] < this_sad[2])
+      this_mv.col -= 1;
+    else
+      this_mv.col += 1;
 
-  ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+    ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
 
-  tmp_sad =
-      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
-  if (best_sad > tmp_sad) {
-    best_int_mv->as_fullmv = this_mv;
-    best_sad = tmp_sad;
+    tmp_sad =
+        cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+    if (best_sad > tmp_sad) {
+      best_int_mv->as_fullmv = this_mv;
+      best_sad = tmp_sad;
+    }
   }
 
   FullMvLimits mv_limits = x->mv_limits;
@@ -2218,6 +2260,10 @@
     for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
   }
 
+  aom_free(hbuf);
+  aom_free(vbuf);
+  aom_free(src_hbuf);
+  aom_free(src_vbuf);
   return best_sad;
 }
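
av1_int_pro_motion_estimation() reduces the 2-D search to two 1-D searches
over row and column projections (aom_int_pro_row/aom_int_pro_col); this patch
sizes the projection buffers from the caller-supplied window, clamps the
window at frame borders, and, for screen content, scans every offset and
keeps only the direction with the lower SAD. A standalone sketch of the 1-D
projection match; a plain SAD stands in for aom_vector_var(), whose exact
metric differs, but the sliding-window structure is the same:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    static int proj_sad(const int16_t *ref, const int16_t *src, int len) {
      int sad = 0;
      for (int i = 0; i < len; i++) {
        const int d = ref[i] - src[i];
        sad += d < 0 ? -d : d;
      }
      return sad;
    }

    /* Slide src over ref; full_search checks every offset (screen content),
     * otherwise only a coarse grid, mirroring vector_match()'s shape. */
    static int match_1d(const int16_t *ref, const int16_t *src, int len,
                        int search_size, int full_search, int *best_sad_out) {
      const int range = search_size << 1;  /* offsets 0 .. 2 * search_size */
      const int step = full_search ? 1 : 16;
      int best_sad = INT_MAX, best_off = 0;
      for (int d = 0; d <= range; d += step) {
        const int sad = proj_sad(&ref[d], src, len);
        if (sad < best_sad) {
          best_sad = sad;
          best_off = d;
        }
      }
      *best_sad_out = best_sad;
      return best_off - search_size;  /* signed displacement */
    }

    int main(void) {
      /* 64-sample source projection inside a 64 + 2*16 reference projection. */
      int16_t ref[96] = { 0 }, src[64];
      for (int i = 0; i < 64; i++) src[i] = (int16_t)(i & 7);
      for (int i = 0; i < 64; i++) ref[i + 21] = src[i];  /* true shift: +5 */
      int sad;
      printf("mv = %d (sad %d)\n", match_1d(ref, src, 64, 16, 1, &sad), sad);
      return 0;
    }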
 
@@ -2960,8 +3006,9 @@
 
 int av1_find_best_sub_pixel_tree_pruned_more(
     MACROBLOCKD *xd, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
-    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
   (void)cm;
   const int allow_hp = ms_params->allow_hp;
   const int forced_stop = ms_params->forced_stop;
@@ -2982,8 +3029,16 @@
                                              ? &cm->sf_identity
                                              : xd->block_ref_scale_factors[0];
   const int is_scaled = av1_is_scaled(sf);
-  besterr = setup_center_error_facade(
-      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
 
   // If forced_stop is FULL_PEL, return.
   if (forced_stop == FULL_PEL) return besterr;
@@ -3045,9 +3100,11 @@
 
 int av1_find_best_sub_pixel_tree_pruned(
     MACROBLOCKD *xd, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
-    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
   (void)cm;
+  (void)start_mv_stats;
   const int allow_hp = ms_params->allow_hp;
   const int forced_stop = ms_params->forced_stop;
   const int iters_per_step = ms_params->iters_per_step;
@@ -3067,8 +3124,16 @@
                                              ? &cm->sf_identity
                                              : xd->block_ref_scale_factors[0];
   const int is_scaled = av1_is_scaled(sf);
-  besterr = setup_center_error_facade(
-      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
 
   // If forced_stop is FULL_PEL, return.
   if (forced_stop == FULL_PEL) return besterr;
@@ -3181,9 +3246,12 @@
 
 int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                  const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                 MV start_mv, MV *bestmv, int *distortion,
+                                 MV start_mv,
+                                 const FULLPEL_MV_STATS *start_mv_stats,
+                                 MV *bestmv, int *distortion,
                                  unsigned int *sse1,
                                  int_mv *last_mv_search_list) {
+  (void)start_mv_stats;
   const int allow_hp = ms_params->allow_hp;
   const int forced_stop = ms_params->forced_stop;
   const int iters_per_step = ms_params->iters_per_step;
@@ -3207,12 +3275,18 @@
                                              : xd->block_ref_scale_factors[0];
   const int is_scaled = av1_is_scaled(sf);
 
-  if (subpel_search_type != USE_2_TAPS_ORIG) {
-    besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
-                                           mv_cost_params, sse1, distortion);
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
   } else {
-    besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                                 distortion);
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                             mv_cost_params, sse1, distortion);
+    } else {
+      besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                                   distortion);
+    }
   }
 
   // If forced_stop is FULL_PEL, return.
@@ -3255,12 +3329,14 @@
 // Returns the maximum MV.
 int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                MV start_mv, MV *bestmv, int *distortion,
-                                unsigned int *sse1,
+                                MV start_mv,
+                                const FULLPEL_MV_STATS *start_mv_stats,
+                                MV *bestmv, int *distortion, unsigned int *sse1,
                                 int_mv *last_mv_search_list) {
   (void)xd;
   (void)cm;
   (void)start_mv;
+  (void)start_mv_stats;
   (void)sse1;
   (void)distortion;
   (void)last_mv_search_list;
@@ -3282,12 +3358,14 @@
 // Returns the minimum MV.
 int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                MV start_mv, MV *bestmv, int *distortion,
-                                unsigned int *sse1,
+                                MV start_mv,
+                                const FULLPEL_MV_STATS *start_mv_stats,
+                                MV *bestmv, int *distortion, unsigned int *sse1,
                                 int_mv *last_mv_search_list) {
   (void)xd;
   (void)cm;
   (void)start_mv;
+  (void)start_mv_stats;
   (void)sse1;
   (void)distortion;
   (void)last_mv_search_list;
@@ -3814,9 +3892,11 @@
 
 int av1_find_best_obmc_sub_pixel_tree_up(
     MACROBLOCKD *xd, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
-    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
   (void)last_mv_search_list;
+  (void)start_mv_stats;
   const int allow_hp = ms_params->allow_hp;
   const int forced_stop = ms_params->forced_stop;
   const int iters_per_step = ms_params->iters_per_step;
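
The common thread through these mcomp.c hunks is the new FULLPEL_MV_STATS:
the best full-pel cost is recorded as its distortion term, its sse, and the
MV err_cost, so the subpel stage can seed its center error as
distortion + err_cost instead of re-running setup_center_error when the
reference is not scaled (otherwise it falls back to the old path, or a NULL
stats pointer is passed). A reduced sketch of that hand-off; the helpers here
are stand-ins, only the field names mirror the patch:

    #include <stdio.h>

    typedef struct {
      int err_cost;             /* rate cost of coding the MV */
      unsigned int distortion;  /* value returned by the variance function */
      unsigned int sse;
    } FullpelMvStatsSketch;     /* mirrors FULLPEL_MV_STATS */

    /* Full-pel side: save the pieces of the best cost as they are computed. */
    static int fullpel_var_cost_sketch(FullpelMvStatsSketch *stats) {
      stats->sse = 4096;         /* would come from vfp->vf(..., &sse) */
      stats->distortion = 2048;  /* vf() return value */
      stats->err_cost = 37;      /* mv_err_cost_() */
      return (int)stats->distortion + stats->err_cost;
    }

    /* Subpel side: when stats are available and the reference is unscaled,
     * seed the center error from them instead of recomputing it. */
    static unsigned int subpel_center_error_sketch(
        const FullpelMvStatsSketch *stats, int is_scaled, unsigned int *sse1,
        int *distortion) {
      if (stats != NULL && !is_scaled) {
        *distortion = (int)stats->distortion;
        *sse1 = stats->sse;
        return stats->distortion + (unsigned int)stats->err_cost;
      }
      /* Fallback: the real code calls the setup_center_error facade here. */
      *distortion = 0;
      *sse1 = 0;
      return 0;
    }

    int main(void) {
      FullpelMvStatsSketch stats;
      const int fullpel_cost = fullpel_var_cost_sketch(&stats);
      unsigned int sse1;
      int distortion;
      const unsigned int besterr = subpel_center_error_sketch(
          &stats, /*is_scaled=*/0, &sse1, &distortion);
      printf("full-pel cost %d reused as subpel besterr %u\n", fullpel_cost,
             besterr);
      return 0;
    }
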
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 6b9af07..87b9309 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -140,13 +140,19 @@
   aom_sad_multi_d_fn_t sdx3df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
+typedef struct {
+  int err_cost;
+  unsigned int distortion;
+  unsigned int sse;
+} FULLPEL_MV_STATS;
+
 void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
 
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
     const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
-    int fine_search_interval);
+    SEARCH_METHODS search_method, int fine_search_interval);
 
 /*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
 void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -256,10 +262,10 @@
 
 int av1_init_search_range(int size);
 
-unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
-                                           MACROBLOCK *x, BLOCK_SIZE bsize,
-                                           int mi_row, int mi_col,
-                                           const MV *ref_mv);
+unsigned int av1_int_pro_motion_estimation(
+    const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+    int me_search_size_col, int me_search_size_row);
 
 int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                              const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
@@ -267,7 +273,8 @@
 int av1_full_pixel_search(const FULLPEL_MV start_mv,
                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                           const int step_param, int *cost_list,
-                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv);
+                          FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+                          FULLPEL_MV *second_best_mv);
 
 int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
                             const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -325,7 +332,9 @@
 
 typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                   MV start_mv, MV *bestmv, int *distortion,
+                                   MV start_mv,
+                                   const FULLPEL_MV_STATS *start_mv_stats,
+                                   MV *bestmv, int *distortion,
                                    unsigned int *sse1,
                                    int_mv *last_mv_search_list);
 
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index b8c000b..e7eec29 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -239,14 +239,16 @@
   // the stride of the reference frame can be different from the one indicated
   // by MotionVectorSearchParams::search_site_cfg. When this happens, we need
   // to readjust the stride.
-  const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+  const SEARCH_METHODS search_method =
+      av1_get_default_mv_search_method(x, mv_sf, bsize);
   const search_site_config *src_search_site_cfg =
       av1_get_search_site_config(cpi, x, search_method);
 
   // Further reduce the search range.
   if (search_range < INT_MAX) {
     const search_site_config *search_site_cfg =
-        &src_search_site_cfg[search_method_lookup[cpi->sf.mv_sf.search_method]];
+        &src_search_site_cfg[search_method_lookup[search_method]];
     // Max step_param is search_site_cfg->num_search_steps.
     if (search_range < 1) {
       step_param = search_site_cfg->num_search_steps;
@@ -259,6 +261,7 @@
   }
 
   int cost_list[5];
+  FULLPEL_MV_STATS best_mv_stats;
   int_mv second_best_mv;
   best_mv->as_int = second_best_mv.as_int = INVALID_MV;
 
@@ -273,21 +276,23 @@
       for (int m = 0; m < cand_cnt; m++) {
         int_mv smv = cand[m].fmv;
         FULLPEL_MV this_best_mv, this_second_best_mv;
+        FULLPEL_MV_STATS this_mv_stats;
 
         if (smv.as_int == INVALID_MV) continue;
 
         av1_make_default_fullpel_ms_params(
             &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
-            src_search_site_cfg, fine_search_interval);
+            src_search_site_cfg, search_method, fine_search_interval);
 
         const int thissme =
             av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
                                   cond_cost_list(cpi, cost_list), &this_best_mv,
-                                  &this_second_best_mv);
+                                  &this_mv_stats, &this_second_best_mv);
 
         if (thissme < bestsme) {
           bestsme = thissme;
           best_mv->as_fullmv = this_best_mv;
+          best_mv_stats = this_mv_stats;
           second_best_mv.as_fullmv = this_second_best_mv;
         }
 
@@ -298,7 +303,7 @@
     case OBMC_CAUSAL:
       av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
                                          &ref_mv, start_mv, src_search_site_cfg,
-                                         fine_search_interval);
+                                         search_method, fine_search_interval);
 
       bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
                                            step_param, &best_mv->as_fullmv);
@@ -385,13 +390,13 @@
 
     switch (mbmi->motion_mode) {
       case SIMPLE_TRANSLATION:
-        if (cpi->sf.mv_sf.use_accurate_subpel_search) {
+        if (mv_sf->use_accurate_subpel_search) {
           const int try_second = second_best_mv.as_int != INVALID_MV &&
                                  second_best_mv.as_int != best_mv->as_int &&
-                                 (cpi->sf.mv_sf.disable_second_mv <= 1);
+                                 (mv_sf->disable_second_mv <= 1);
           const int best_mv_var = mv_search_params->find_fractional_mv_step(
-              xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
-              &x->pred_sse[ref], fractional_ms_list);
+              xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+              &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list);
 
           if (try_second) {
             struct macroblockd_plane *p = xd->plane;
@@ -400,7 +405,7 @@
               { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
             };
             int64_t rd = INT64_MAX;
-            if (!cpi->sf.mv_sf.disable_second_mv) {
+            if (!mv_sf->disable_second_mv) {
               // Calculate actual rd cost.
               mbmi->mv[0].as_mv = best_mv->as_mv;
               av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
@@ -423,10 +428,10 @@
                                          subpel_start_mv)) {
               unsigned int sse;
               const int this_var = mv_search_params->find_fractional_mv_step(
-                  xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
-                  &sse, fractional_ms_list);
+                  xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv,
+                  &dis, &sse, fractional_ms_list);
 
-              if (!cpi->sf.mv_sf.disable_second_mv) {
+              if (!mv_sf->disable_second_mv) {
                 // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
                 // to choose the better MV.
                 mbmi->mv[0].as_mv = this_best_mv;
@@ -459,14 +464,14 @@
           }
         } else {
           mv_search_params->find_fractional_mv_step(
-              xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
-              &x->pred_sse[ref], NULL);
+              xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+              &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL);
         }
         break;
       case OBMC_CAUSAL:
-        av1_find_best_obmc_sub_pixel_tree_up(xd, cm, &ms_params,
-                                             subpel_start_mv, &best_mv->as_mv,
-                                             &dis, &x->pred_sse[ref], NULL);
+        av1_find_best_obmc_sub_pixel_tree_up(
+            xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis,
+            &x->pred_sse[ref], NULL);
         break;
       default: assert(0 && "Invalid motion mode!\n");
     }
@@ -621,25 +626,28 @@
 
     // Make motion search params
     FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
-    const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+    FULLPEL_MV_STATS best_mv_stats;
+    const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+    const SEARCH_METHODS search_method =
+        av1_get_default_mv_search_method(x, mv_sf, bsize);
     const search_site_config *src_search_sites =
         av1_get_search_site_config(cpi, x, search_method);
     // Use the mv result from the single mode as mv predictor.
     const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
     av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
                                        &ref_mv[id].as_mv, start_fullmv,
-                                       src_search_sites,
+                                       src_search_sites, search_method,
                                        /*fine_search_interval=*/0);
 
     av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                              mask_stride, id);
 
     // Small-range full-pixel motion search.
-    if (!cpi->sf.mv_sf.disable_extensive_joint_motion_search &&
+    if (!mv_sf->disable_extensive_joint_motion_search &&
         mbmi->interinter_comp.type != COMPOUND_WEDGE) {
-      bestsme =
-          av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
-                                &best_mv.as_fullmv, &second_best_mv.as_fullmv);
+      bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                      &best_mv.as_fullmv, &best_mv_stats,
+                                      &second_best_mv.as_fullmv);
     } else {
       bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
                                          &best_mv.as_fullmv);
@@ -683,15 +691,15 @@
       MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
       assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
       bestsme = cpi->mv_search_params.find_fractional_mv_step(
-          xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+          xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL);
 
       if (try_second) {
         MV this_best_mv;
         MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
         if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
           const int thissme = cpi->mv_search_params.find_fractional_mv_step(
-              xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, &sse,
-              NULL);
+              xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis,
+              &sse, NULL);
           if (thissme < bestsme) {
             best_mv.as_mv = this_best_mv;
             bestsme = thissme;
@@ -775,14 +783,16 @@
 
   // Make motion search params
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
-  const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+  FULLPEL_MV_STATS best_mv_stats;
+  const SEARCH_METHODS search_method =
+      av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
   const search_site_config *src_search_sites =
       av1_get_search_site_config(cpi, x, search_method);
   // Use the mv result from the single mode as mv predictor.
   const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
                                      &ref_mv.as_mv, start_fullmv,
-                                     src_search_sites,
+                                     src_search_sites, search_method,
                                      /*fine_search_interval=*/0);
 
   av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
@@ -790,7 +800,7 @@
 
   // Small-range full-pixel motion search.
   bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
-                                  &best_mv.as_fullmv, NULL);
+                                  &best_mv.as_fullmv, &best_mv_stats, NULL);
 
   if (scaled_ref_frame) {
     // Swap back the original buffers for subpel motion search for the 0th slot.
@@ -816,7 +826,8 @@
     MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
     bestsme = cpi->mv_search_params.find_fractional_mv_step(
-        xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+        xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis,
+        &sse, NULL);
   }
 
   // Restore the pointer to the first unscaled prediction buffer.
@@ -954,10 +965,12 @@
   return tmp_rate_mv;
 }
 
-int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                                int mi_col, BLOCK_SIZE bsize, int ref,
-                                FULLPEL_MV start_mv, int num_planes,
-                                int use_subpixel) {
+int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize, int ref,
+                                        FULLPEL_MV start_mv, int num_planes,
+                                        int use_subpixel, unsigned int *sse,
+                                        unsigned int *var) {
   assert(num_planes == 1 &&
          "Currently simple_motion_search only supports luma plane");
   assert(!frame_is_intra_only(&cpi->common) &&
@@ -986,8 +999,9 @@
              MAX_MVSEARCH_STEPS - 2);
   int cost_list[5];
   const int ref_idx = 0;
-  int var;
+  int bestsme;
   int_mv best_mv;
+  FULLPEL_MV_STATS best_mv_stats;
 
   av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
                        get_ref_scale_factors(cm, ref), num_planes);
@@ -1001,20 +1015,23 @@
   // Allow more mesh searches for screen content type on the ARF.
   const int fine_search_interval = use_fine_search_interval(cpi);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
-  const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
+  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+  const SEARCH_METHODS search_method =
+      av1_get_default_mv_search_method(x, mv_sf, bsize);
   const search_site_config *src_search_sites =
       av1_get_search_site_config(cpi, x, search_method);
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     start_mv, src_search_sites,
+                                     start_mv, src_search_sites, search_method,
                                      fine_search_interval);
 
-  var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
-                              cond_cost_list(cpi, cost_list),
-                              &best_mv.as_fullmv, NULL);
+  bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                                  cond_cost_list(cpi, cost_list),
+                                  &best_mv.as_fullmv, &best_mv_stats, NULL);
 
   const int use_subpel_search =
-      var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
-      use_subpixel;
+      bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+      use_subpixel &&
+      (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL);
   if (scaled_ref_frame) {
     xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
   }
@@ -1025,50 +1042,30 @@
     av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                       cost_list);
     // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
-    ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop;
+    ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop;
 
     MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
 
     cpi->mv_search_params.find_fractional_mv_step(
-        xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, &not_used,
-        &x->pred_sse[ref], NULL);
+        xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv,
+        &not_used, &x->pred_sse[ref], NULL);
+
+    mbmi->mv[0] = best_mv;
+
+    // Get a copy of the prediction output
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+    *var = cpi->ppi->fn_ptr[bsize].vf(
+        x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+        xd->plane[0].dst.stride, sse);
   } else {
     // Manually convert from units of pixel to 1/8-pixels if we are not doing
     // subpel search
     convert_fullmv_to_mv(&best_mv);
+    *var = best_mv_stats.distortion;
+    *sse = best_mv_stats.sse;
   }
 
-  mbmi->mv[0] = best_mv;
-
-  // Get a copy of the prediction output
-  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                AOM_PLANE_Y, AOM_PLANE_Y);
-
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
-  }
-
-  return best_mv;
-}
-
-int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize,
-                                 const FULLPEL_MV start_mv, int use_subpixel,
-                                 unsigned int *sse, unsigned int *var) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const MV_REFERENCE_FRAME ref =
-      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
-
-  int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
-                                            start_mv, 1, use_subpixel);
-
-  const uint8_t *src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
-
-  *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
-
   return best_mv;
 }
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index d2996bc..d1fa915 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -59,18 +59,14 @@
                                       int *rate_mv, int ref_idx);
 
 // Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
-// ref. Note that this sets the offset of mbmi, so we will need to reset it
-// after calling this function.
-int_mv av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
-                                int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                int ref, FULLPEL_MV start_mv, int num_planes,
-                                int use_subpixel);
-
-// Performs a simple motion search to calculate the sse and var of the residue
-int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
-                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                 const FULLPEL_MV start_mv, int use_subpixel,
-                                 unsigned int *sse, unsigned int *var);
+// ref and calculates the sse and var of the residue. Note that this sets the
+// offset of mbmi, so we will need to reset it after calling this function.
+int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize, int ref,
+                                        const FULLPEL_MV start_mv,
+                                        int num_planes, int use_subpixel,
+                                        unsigned int *sse, unsigned int *var);
 
 static AOM_INLINE const search_site_config *av1_get_search_site_config(
     const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
@@ -101,6 +97,47 @@
   return x->search_site_cfg_buf;
 }
 
+static AOM_INLINE SEARCH_METHODS
+av1_get_faster_search_method(SEARCH_METHODS search_method) {
+  // Note on search method's accuracy:
+  //  1. NSTEP
+  //  2. DIAMOND
+  //  3. BIGDIA \approx SQUARE
+  //  4. HEX
+  //  5. FAST_HEX \approx FAST_DIAMOND
+  switch (search_method) {
+    case NSTEP: return DIAMOND;
+    case NSTEP_8PT: return DIAMOND;
+    case DIAMOND: return BIGDIA;
+    case CLAMPED_DIAMOND: return BIGDIA;
+    case BIGDIA: return HEX;
+    case SQUARE: return HEX;
+    case HEX: return FAST_HEX;
+    case FAST_HEX: return FAST_HEX;
+    case FAST_DIAMOND: return VFAST_DIAMOND;
+    case FAST_BIGDIA: return FAST_BIGDIA;
+    case VFAST_DIAMOND: return VFAST_DIAMOND;
+    default: assert(0 && "Invalid search method!"); return DIAMOND;
+  }
+}
+
+static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method(
+    const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) {
+  SEARCH_METHODS search_method = mv_sf->search_method;
+  const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
+  const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+  const int qband = x->qindex >> (QINDEX_BITS - 2);
+  const bool use_faster_search_method =
+      (sf_blk_search_method == 1 && min_dim >= 32) ||
+      (sf_blk_search_method >= 2 && min_dim >= 16 &&
+       x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
+
+  if (use_faster_search_method) {
+    search_method = av1_get_faster_search_method(search_method);
+  }
+  return search_method;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
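For reference, a minimal standalone sketch (not part of the patch) of how the new bsize/qindex gating above behaves for one concrete case. It assumes QINDEX_BITS == 8 (so qband = qindex >> 6) and uses made-up values for qindex, block size, and the speed-feature level; the NSTEP -> DIAMOND downgrade follows the switch in av1_get_faster_search_method.

#include <stdio.h>

int main(void) {
  // Assumed example inputs (not taken from the patch).
  const int qindex = 120;         // mid-range quantizer index
  const int min_dim = 16;         // min(block_size_wide, block_size_high)
  const int sf_level = 2;         // use_bsize_dependent_search_method
  const int sad_le_med = 1;       // source_sad_nonrd <= kMedSad
  const int qband = qindex >> 6;  // QINDEX_BITS - 2 == 6 when QINDEX_BITS == 8
  const int use_faster = (sf_level == 1 && min_dim >= 32) ||
                         (sf_level >= 2 && min_dim >= 16 && sad_le_med &&
                          qband < 3);
  // With these inputs, qband == 1 and use_faster == 1, so NSTEP would be
  // downgraded to DIAMOND by av1_get_faster_search_method().
  printf("qband=%d use_faster=%d\n", qband, use_faster);
  return 0;
}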
diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h
index 36a0c23..4fd88a7 100644
--- a/av1/encoder/nonrd_opt.h
+++ b/av1/encoder/nonrd_opt.h
@@ -103,6 +103,8 @@
   int use_ref_frame_mask[REF_FRAMES];
   //! Array to hold flags of evaluated modes for each reference frame
   uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+  //! Array to hold flags indicating if the scaled reference frame is used.
+  bool use_scaled_ref_frame[REF_FRAMES];
 } InterModeSearchStateNonrd;
 
 static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
@@ -412,30 +414,36 @@
  *                                        data for the current macroblock
  * \param[in]    ref_frame                Reference frame for which to find
  *                                        ref MVs
- * \param[in]    frame_mv                 Predicted MVs for a block
+ * \param[out]   frame_mv                 Predicted MVs for a block
  * \param[in]    yv12_mb                  Buffer to hold predicted block
  * \param[in]    bsize                    Current block size
  * \param[in]    force_skip_low_temp_var  Flag indicating possible mode search
  *                                        prune for low temporal variance block
  * \param[in]    skip_pred_mv             Flag indicating to skip av1_mv_pred
+ * \param[out]   use_scaled_ref_frame     Flag to indicate if scaled reference
+ *                                        frame is used.
  *
  * \remark Nothing is returned. Instead, predicted MVs are placed into
- * \c frame_mv array
+ * \c frame_mv array, and use_scaled_ref_frame is set.
  */
-static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
-                                   MV_REFERENCE_FRAME ref_frame,
-                                   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
-                                   struct buf_2d yv12_mb[8][MAX_MB_PLANE],
-                                   BLOCK_SIZE bsize,
-                                   int force_skip_low_temp_var,
-                                   int skip_pred_mv) {
+static INLINE void find_predictors(
+    AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+    struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+    int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+  int base_is_key_frame = 0;
+  const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame);
+  const bool ref_is_scaled =
+      ref->y_crop_height != cm->height || ref->y_crop_width != cm->width;
+  const YV12_BUFFER_CONFIG *scaled_ref =
+      av1_get_scaled_ref_frame(cpi, ref_frame);
+  const YV12_BUFFER_CONFIG *yv12 =
+      ref_is_scaled && scaled_ref ? scaled_ref : ref;
   const int num_planes = av1_num_planes(cm);
-
   x->pred_mv_sad[ref_frame] = INT_MAX;
   x->pred_mv0_sad[ref_frame] = INT_MAX;
   x->pred_mv1_sad[ref_frame] = INT_MAX;
@@ -443,8 +451,15 @@
   // TODO(kyslov) this needs various further optimizations. to be continued..
   assert(yv12 != NULL);
   if (yv12 != NULL) {
-    const struct scale_factors *const sf =
-        get_ref_scale_factors_const(cm, ref_frame);
+    if (cpi->ppi->use_svc) {
+      SVC *const svc = &cpi->svc;
+      const int layer =
+          LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                           svc->number_temporal_layers);
+      base_is_key_frame = svc->layer_context[layer].is_key_frame;
+    }
+    struct scale_factors *const sf =
+        scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);
     av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
     av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                      xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
@@ -456,9 +471,11 @@
         cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
         &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
     frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
-    // Early exit for non-LAST frame if force_skip_low_temp_var is set.
-    if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 && !skip_pred_mv &&
-        !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+    // Early exit for non-LAST frame if force_skip_low_temp_var or
+    // ref_is_scaled is set.
+    if (bsize >= BLOCK_8X8 && !skip_pred_mv && !base_is_key_frame &&
+        !((ref_is_scaled || force_skip_low_temp_var) &&
+          ref_frame != LAST_FRAME)) {
       av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
                   bsize);
     }
@@ -467,6 +484,7 @@
     av1_count_overlappable_neighbors(cm, xd);
   }
   mbmi->num_proj_ref = 1;
+  *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
 }
 
 static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 24a5264..1836869 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -179,8 +179,6 @@
  * \param[in]    x                        Pointer to structure holding all the
  *                                        data for the current macroblock
  * \param[in]    bsize                    Current block size
- * \param[in]    mi_row                   Row index in 4x4 units
- * \param[in]    mi_col                   Column index in 4x4 units
  * \param[in]    tmp_mv                   Pointer to best found New MV
  * \param[in]    rate_mv                  Pointer to Rate of the best new MV
  * \param[in]    best_rd_sofar            RD Cost of the best mode found so far
@@ -192,15 +190,13 @@
  * Rate estimation for this vector is placed to \c rate_mv
  */
 static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
-                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                  int_mv *tmp_mv, int *rate_mv,
-                                  int64_t best_rd_sofar, int use_base_mv) {
+                                  BLOCK_SIZE bsize, int_mv *tmp_mv,
+                                  int *rate_mv, int64_t best_rd_sofar,
+                                  int use_base_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
   const SPEED_FEATURES *sf = &cpi->sf;
   MB_MODE_INFO *mi = xd->mi[0];
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
   int step_param = (sf->rt_sf.fullpel_search_step_param)
                        ? sf->rt_sf.fullpel_search_step_param
                        : cpi->mv_search_params.mv_step_param;
@@ -212,19 +208,6 @@
   int rv = 0;
   int cost_list[5];
   int search_subpel = 1;
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-
-  if (scaled_ref_frame) {
-    int plane;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (plane = 0; plane < MAX_MB_PLANE; plane++)
-      backup_yv12[plane] = xd->plane[plane].pre[0];
-    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
-  }
 
   start_mv = get_fullmv_from_mv(&ref_mv);
 
@@ -233,17 +216,19 @@
   else
     center_mv = tmp_mv->as_mv;
 
-  const SEARCH_METHODS search_method = sf->mv_sf.search_method;
+  const SEARCH_METHODS search_method =
+      av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
   const search_site_config *src_search_sites =
       av1_get_search_site_config(cpi, x, search_method);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  FULLPEL_MV_STATS best_mv_stats;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     start_mv, src_search_sites,
+                                     start_mv, src_search_sites, search_method,
                                      /*fine_search_interval=*/0);
 
   const unsigned int full_var_rd = av1_full_pixel_search(
       start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
-      &tmp_mv->as_fullmv, NULL);
+      &tmp_mv->as_fullmv, &best_mv_stats, NULL);
 
   // calculate the bit cost on motion vector
   MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
@@ -272,22 +257,17 @@
     // adaptively downgrade subpel search method based on block properties
     if (use_aggressive_subpel_search_method(
             x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
-      av1_find_best_sub_pixel_tree_pruned_more(xd, cm, &ms_params,
-                                               subpel_start_mv, &tmp_mv->as_mv,
-                                               &dis, &x->pred_sse[ref], NULL);
+      av1_find_best_sub_pixel_tree_pruned_more(
+          xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+          &dis, &x->pred_sse[ref], NULL);
     else
       cpi->mv_search_params.find_fractional_mv_step(
-          xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
-          &x->pred_sse[ref], NULL);
+          xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+          &dis, &x->pred_sse[ref], NULL);
     *rate_mv =
         av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
                         x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
   }
-
-  if (scaled_ref_frame) {
-    for (int plane = 0; plane < MAX_MB_PLANE; plane++)
-      xd->plane[plane].pre[0] = backup_yv12[plane];
-  }
   // The final MV can not be equal to the reference MV as this will trigger an
   // assert later. This can happen if both NEAREST and NEAR modes were skipped.
   rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
@@ -331,6 +311,7 @@
   MB_MODE_INFO *const mi = xd->mi[0];
   AV1_COMMON *cm = &cpi->common;
   int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame];
+  unsigned int y_sad_zero;
   if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
       gf_temporal_ref) {
     int tmp_sad;
@@ -338,9 +319,12 @@
 
     if (bsize < BLOCK_16X16) return -1;
 
+    int me_search_size_col = block_size_wide[bsize] >> 1;
+    int me_search_size_row = block_size_high[bsize] >> 1;
     tmp_sad = av1_int_pro_motion_estimation(
         cpi, x, bsize, mi_row, mi_col,
-        &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv);
+        &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero,
+        me_search_size_col, me_search_size_row);
 
     if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
 
@@ -363,7 +347,7 @@
     MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
     cpi->mv_search_params.find_fractional_mv_step(
-        xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
+        xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis,
         &x->pred_sse[ref_frame], NULL);
     this_ref_frm_newmv->as_int = best_mv.as_int;
 
@@ -378,9 +362,8 @@
     *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
                                x->mv_costs->nmv_joint_cost,
                                x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
-  } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                     &frame_mv[NEWMV][ref_frame], rate_mv,
-                                     best_rdc->rdcost, 0)) {
+  } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+                                     rate_mv, best_rdc->rdcost, 0)) {
     return -1;
   }
 
@@ -1689,7 +1672,7 @@
 
 static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
                                               MB_MODE_INFO *mi, int mi_row,
-                                              int mi_col, int bsize,
+                                              int mi_col, BLOCK_SIZE bsize,
                                               int gf_temporal_ref,
                                               int use_ref_frame[],
                                               int *force_skip_low_temp_var) {
@@ -1779,7 +1762,7 @@
       x->nonrd_prune_ref_frame_search > 2 &&
       x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
       x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
-    int thr = (cm->width * cm->height >= 640 * 360) ? 100 : 150;
+    int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
     int pred = x->pred_mv_sad[LAST_FRAME] >>
                (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
     if (pred > thr) use_golden_ref_frame = 1;
@@ -1804,10 +1787,10 @@
   use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
   use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
   use_ref_frame[LAST_FRAME] = use_last_ref_frame;
-  // For now keep this assert on, but we should remove it for svc mode,
-  // as the user may want to generate an intra-only frame (no inter-modes).
-  // Remove this assert in subsequent CL when nonrd_pickmode is tested for the
-  // case of intra-only frame (no references enabled).
+  // Keep this assert on, as only 3 references are used in nonrd_pickmode
+  // (LAST, GOLDEN, ALTREF). If none of the 3 is set by the user, this
+  // frame must be an intra-only frame and hence should never reach this
+  // inter-frame pickmode.
   assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
 }
 
@@ -1918,6 +1901,14 @@
   return 0;
 }
 
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                 struct buf_2d *yv12_mb) {
+  struct macroblock_plane *const p = &x->plane[0];
+  const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+                                                yv12_mb->buf, yv12_mb->stride);
+  if (y_sad == 0) x->block_is_zero_sad = 1;
+}
+
 static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int y_sad,
                                   unsigned int source_variance,
@@ -1925,14 +1916,33 @@
   const int subsampling_x = cpi->common.seq_params->subsampling_x;
   const int subsampling_y = cpi->common.seq_params->subsampling_y;
   const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+  const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+  if (bsize == cpi->common.seq_params->sb_size) {
+    // At the superblock level, color_sensitivity is already set to 0, 1, or
+    // 2, where 2 is the middle/uncertain level. To avoid additional sad
+    // computations when bsize = sb_size, force level 2 to 1 (certain color)
+    // for motion areas.
+    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+      x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+          source_sad_nonrd >= kMedSad ? 1 : 0;
+    }
+    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+      x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+          source_sad_nonrd >= kMedSad ? 1 : 0;
+    }
+    return;
+  }
   int shift = 3;
-  if (source_sad_nonrd >= kMedSad &&
-      cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
-      cpi->common.width * cpi->common.height >= 640 * 360)
+  unsigned int source_var_thr = 50;
+  int uv_sad_thr = 100;
+  if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
     shift = 4;
-  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
-      cpi->rc.high_source_sad) {
-    shift = 6;
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    if (cpi->rc.high_source_sad) shift = 6;
+    if (source_sad_nonrd > kMedSad) {
+      source_var_thr = 1200;
+      uv_sad_thr = 10;
+    }
   }
   NOISE_LEVEL noise_level = kLow;
   int norm_sad =
@@ -1953,8 +1963,12 @@
   const int num_planes = av1_num_planes(&cpi->common);
 
   for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+    // Always check if level = 2. If level = 0, check again for
+    // motion areas at higher resolutions, where color artifacts
+    // are more noticeable.
     if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
-        source_variance < 50) {
+        (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+         source_sad_nonrd >= kMedSad && high_res)) {
       struct macroblock_plane *const p = &x->plane[plane];
       const BLOCK_SIZE bs =
           get_plane_block_size(bsize, subsampling_x, subsampling_y);
@@ -1966,7 +1980,7 @@
           uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
       x->color_sensitivity[COLOR_SENS_IDX(plane)] =
           uv_sad > (y_sad >> shift) && norm_uv_sad > 40;
-      if (source_variance < 50 && norm_uv_sad > 100)
+      if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr)
         x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1;
     }
   }
@@ -2213,9 +2227,8 @@
 // Function to setup parameters used for inter mode evaluation in non-rd.
 static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
     AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
-    RD_STATS *rd_cost, int *force_skip_low_temp_var, int *skip_pred_mv,
-    int mi_row, int mi_col, int gf_temporal_ref, unsigned char segment_id,
-    BLOCK_SIZE bsize
+    RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+    int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
 #if CONFIG_AV1_TEMPORAL_DENOISING
     ,
     PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
@@ -2226,6 +2239,7 @@
   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   MB_MODE_INFO *const mi = xd->mi[0];
   const ModeCosts *mode_costs = &x->mode_costs;
+  int skip_pred_mv = 0;
 
   // Initialize variance and distortion (chroma) for all modes and reference
   // frames
@@ -2272,20 +2286,21 @@
 #endif
 
   // Populate predicated motion vectors for LAST_FRAME
-  if (cpi->ref_frame_flags & AOM_LAST_FLAG)
+  if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
     find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
                     search_state->yv12_mb, bsize, *force_skip_low_temp_var,
-                    x->force_zeromv_skip_for_blk);
-
+                    x->force_zeromv_skip_for_blk,
+                    &search_state->use_scaled_ref_frame[LAST_FRAME]);
+  }
   // Update mask to use all reference frame
   get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
                          search_state->use_ref_frame_mask,
                          force_skip_low_temp_var);
 
-  *skip_pred_mv = x->force_zeromv_skip_for_blk ||
-                  (x->nonrd_prune_ref_frame_search > 2 &&
-                   x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
-                   x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+  skip_pred_mv = x->force_zeromv_skip_for_blk ||
+                 (x->nonrd_prune_ref_frame_search > 2 &&
+                  x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+                  x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
 
   // Populate predicated motion vectors for other single reference frame
   // Start at LAST_FRAME + 1.
@@ -2294,7 +2309,8 @@
     if (search_state->use_ref_frame_mask[ref_frame_iter]) {
       find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
                       search_state->yv12_mb, bsize, *force_skip_low_temp_var,
-                      *skip_pred_mv);
+                      skip_pred_mv,
+                      &search_state->use_scaled_ref_frame[ref_frame_iter]);
     }
   }
 }
@@ -2334,6 +2350,27 @@
     *ref_frame2 = NONE_FRAME;
   }
 
+  if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+    // We want to make sure the superblock MV gets tested:
+    // don't skip (return false) NEAREST_LAST or NEAR_LAST if they
+    // carry this sb MV, and don't skip NEWMV_LAST, which will be set
+    // to the sb MV in handle_inter_mode_nonrd() in case NEAREST or
+    // NEAR don't have it.
+    if (*this_mode == NEARESTMV &&
+        search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+            x->sb_me_mv.as_int) {
+      return false;
+    }
+    if (*this_mode == NEARMV &&
+        search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+            x->sb_me_mv.as_int) {
+      return false;
+    }
+    if (*this_mode == NEWMV) {
+      return false;
+    }
+  }
+
   // Skip the single reference mode for which mode check flag is set.
   if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
     return true;
@@ -2397,13 +2434,12 @@
       get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
     return true;
 
-  // For screen content: for base spatial layer only for now.
-  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
-      cpi->svc.spatial_layer_id == 0) {
+  // For screen content: skip mode testing based on source_sad.
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
     // If source_sad is computed: skip non-zero motion
     // check for stationary (super)blocks. Otherwise if superblock
-    // has motion skip the modes with zero motion for flat blocks,
-    // and color is not set.
+    // has motion, skip the modes with zero motion on the last reference
+    // for flat blocks when color is not set.
     // For the latter condition: the same condition should apply
     // to newmv if (0, 0), so this latter condition is repeated
     // below after search_new_mv.
@@ -2411,9 +2447,9 @@
       if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
            x->content_state_sb.source_sad_nonrd == kZeroSad) ||
           (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
-           x->content_state_sb.source_sad_nonrd != kZeroSad &&
-           ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
-             x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+           x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+           ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+             x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
             cpi->rc.high_source_sad) &&
            x->source_variance == 0))
         return true;
@@ -2479,10 +2515,11 @@
 #if CONFIG_AV1_TEMPORAL_DENOISING
     int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
 #endif
-    int idx, int force_mv_inter_layer, int is_single_pred, int skip_pred_mv,
-    int gf_temporal_ref, int use_model_yrd_large, int filter_search_enabled_blk,
-    BLOCK_SIZE bsize, PREDICTION_MODE this_mode, InterpFilter filt_select,
-    int cb_pred_filter_search, int reuse_inter_pred) {
+    int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+    int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+    PREDICTION_MODE this_mode, InterpFilter filt_select,
+    int cb_pred_filter_search, int reuse_inter_pred,
+    int *sb_me_has_been_tested) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
@@ -2511,7 +2548,10 @@
   RD_STATS nonskip_rdc;
   av1_invalid_rd_stats(&nonskip_rdc);
 
-  if (this_mode == NEWMV && !force_mv_inter_layer) {
+  if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+    // Set the NEWMV_LAST to the sb MV.
+    search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+  } else if (this_mode == NEWMV && !force_mv_inter_layer) {
 #if COLLECT_NONRD_PICK_MODE_STAT
     aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
 #endif
@@ -2552,13 +2592,13 @@
   if (skip_this_mv && is_single_pred) return true;
 
   // For screen: for spatially flat blocks with non-zero motion,
-  // skip newmv if the motion vector is (0, 0), and color is not set.
+  // skip newmv on LAST if the motion vector is (0, 0) and color is not set.
   if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
       cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
-    if (this_mv->as_int == 0 &&
-        x->content_state_sb.source_sad_nonrd != kZeroSad &&
-        ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
-          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+    if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+        x->block_is_zero_sad == 0 &&
+        ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+          x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
          cpi->rc.high_source_sad) &&
         x->source_variance == 0)
       return true;
@@ -2581,32 +2621,6 @@
     }
   }
 
-  if (idx == 0 && !skip_pred_mv) {
-    // Set color sensitivity on first tested mode only.
-    // Use y-sad already computed in find_predictors: take the sad with motion
-    // vector closest to 0; the uv-sad computed below in set_color_sensitivity
-    // is for zeromv.
-    // For screen: first check if golden reference is being used, if so,
-    // force color_sensitivity on if the color sensitivity for sb_g is on.
-    if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
-        search_state->use_ref_frame_mask[GOLDEN_FRAME]) {
-      if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
-        x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
-      if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
-        x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
-    } else {
-      int y_sad = x->pred_mv0_sad[LAST_FRAME];
-      if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
-          (abs(search_state->frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
-           abs(search_state->frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
-              (abs(search_state->frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
-               abs(search_state->frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
-        y_sad = x->pred_mv1_sad[LAST_FRAME];
-      set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
-                            search_state->yv12_mb[LAST_FRAME]);
-    }
-  }
-
   mi->motion_mode = SIMPLE_TRANSLATION;
 #if !CONFIG_REALTIME_ONLY
   if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
@@ -2786,6 +2800,8 @@
       // Compute sse for chroma planes.
       const int64_t sse_uv = av1_model_rd_for_sb_uv(
           cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V);
+      if (rdc_uv.dist < x->min_dist_inter_uv)
+        x->min_dist_inter_uv = rdc_uv.dist;
       search_state->this_rdc.sse += sse_uv;
       // Restore Y rdc if UV rdc disallows txfm skip
       if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
@@ -2875,6 +2891,11 @@
       aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
 #endif
 
+  if (x->sb_me_block && ref_frame == LAST_FRAME &&
+      search_state->frame_mv[this_best_mode][ref_frame].as_int ==
+          x->sb_me_mv.as_int)
+    *sb_me_has_been_tested = 1;
+
   // Copy best mode params to search state
   if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
     search_state->best_rdc = search_state->this_rdc;
@@ -2900,7 +2921,7 @@
 
   if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) {
     txfm_info->skip_txfm = 1;
-    return false;
+    if (!x->sb_me_block || *sb_me_has_been_tested) return false;
   }
   return true;
 }
@@ -2960,6 +2981,8 @@
       }
       av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U,
                              AOM_PLANE_V);
+      if (rdc_uv.dist < x->min_dist_inter_uv)
+        x->min_dist_inter_uv = rdc_uv.dist;
       idtx_rdc.rate += rdc_uv.rate;
       idtx_rdc.dist += rdc_uv.dist;
       idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm;
@@ -3071,7 +3094,6 @@
   int best_early_term = 0;
   int force_skip_low_temp_var = 0;
   unsigned int sse_zeromv_norm = UINT_MAX;
-  int skip_pred_mv = 0;
   const int num_inter_modes = NUM_INTER_MODES;
   const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
   bool check_globalmv = rt_sf->check_globalmv_on_single_ref;
@@ -3082,6 +3104,7 @@
       rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8;
   InterModeSearchStateNonrd search_state;
   av1_zero(search_state.use_ref_frame_mask);
+  av1_zero(search_state.use_scaled_ref_frame);
   BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode;
   (void)tile_data;
 
@@ -3111,7 +3134,9 @@
   const int resize_pending = is_frame_resize_pending(cpi);
 #endif
   const ModeCosts *mode_costs = &x->mode_costs;
-
+  struct scale_factors sf_no_scale;
+  av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+                                    cm->width, cm->height);
   if (reuse_inter_pred) {
     for (int buf_idx = 0; buf_idx < 3; buf_idx++) {
       tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx];
@@ -3130,7 +3155,9 @@
   // to source, so use subpel motion vector to compensate. The nonzero motion
   // is half pixel shifted to left and top, so (-4, -4). This has more effect
   // on higher resolutions, so condition it on that for now.
+  // Exclude quality layers, which have the same resolution and hence no shift.
   if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+      !svc->has_lower_quality_layer &&
       svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
       cm->width * cm->height > 640 * 480) {
     svc_mv.as_mv.row = -4;
@@ -3138,12 +3165,12 @@
   }
 
   // Setup parameters used for inter mode evaluation.
-  set_params_nonrd_pick_inter_mode(
-      cpi, x, &search_state, rd_cost, &force_skip_low_temp_var, &skip_pred_mv,
-      mi_row, mi_col, gf_temporal_ref, segment_id, bsize
+  set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+                                   &force_skip_low_temp_var, mi_row, mi_col,
+                                   gf_temporal_ref, segment_id, bsize
 #if CONFIG_AV1_TEMPORAL_DENOISING
-      ,
-      ctx, denoise_svc_pickmode
+                                   ,
+                                   ctx, denoise_svc_pickmode
 #endif
   );
 
@@ -3207,6 +3234,28 @@
   inter_pred_params_sr.conv_params =
       get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
 
+  x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      !x->force_zeromv_skip_for_blk &&
+      x->content_state_sb.source_sad_nonrd != kZeroSad &&
+      x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+      search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+      search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+    set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+  }
+
+  int sb_me_has_been_tested = 0;
+  x->sb_me_block = x->sb_me_partition;
+  // Only use this feature (force testing of superblock motion) if coding
+  // block size is large.
+  if (x->sb_me_block) {
+    if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+      x->sb_me_block = 0;
+    else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+      x->sb_me_block = 0;
+  }
+
+  x->min_dist_inter_uv = INT64_MAX;
   for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
     // If we are at the first compound mode, and the single modes already
     // perform well, then end the search.
@@ -3218,6 +3267,36 @@
     int is_single_pred = 1;
     PREDICTION_MODE this_mode;
 
+    if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+      // Set color sensitivity on first tested mode only.
+      // Use y-sad already computed in find_predictors: take the sad with motion
+      // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+      // is for zeromv.
+      // For screen: first check if golden reference is being used, if so,
+      // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1.
+      // The check in set_color_sensitivity() will then follow and check for
+      // setting the flag if the level is still 2 or 0.
+      if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+          search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+        if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+        if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+      }
+      if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+          x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+        int y_sad = x->pred_mv0_sad[LAST_FRAME];
+        if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+            (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+             abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+                (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+                 abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+          y_sad = x->pred_mv1_sad[LAST_FRAME];
+        set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+                              search_state.yv12_mb[LAST_FRAME]);
+      }
+    }
+
     // Check the inter mode can be skipped based on mode statistics and speed
     // features settings.
     if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
@@ -3239,6 +3318,16 @@
     mi->ref_frame[1] = ref_frame2;
     set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
 
+    // Check if the scaled reference frame should be used. This is set in
+    // find_predictors() for each usable reference. If so, set the
+    // block_ref_scale_factors[] to no reference scaling.
+    if (search_state.use_scaled_ref_frame[ref_frame]) {
+      xd->block_ref_scale_factors[0] = &sf_no_scale;
+    }
+    if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) {
+      xd->block_ref_scale_factors[1] = &sf_no_scale;
+    }
+
     // Perform inter mode evaluation for non-rd
     if (!handle_inter_mode_nonrd(
             cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer,
@@ -3247,10 +3336,10 @@
 #if CONFIG_AV1_TEMPORAL_DENOISING
             &zero_last_cost_orig, denoise_svc_pickmode,
 #endif
-            idx, force_mv_inter_layer, is_single_pred, skip_pred_mv,
-            gf_temporal_ref, use_model_yrd_large, filter_search_enabled_blk,
-            bsize, this_mode, filt_select, cb_pred_filter_search,
-            reuse_inter_pred)) {
+            idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref,
+            use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode,
+            filt_select, cb_pred_filter_search, reuse_inter_pred,
+            &sb_me_has_been_tested)) {
       break;
     }
   }
@@ -3292,8 +3381,8 @@
   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
       x->content_state_sb.source_sad_nonrd != kZeroSad &&
       bsize <= BLOCK_16X16) {
-    unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 250000;
-    unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 1000;
+    unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000;
+    unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 200;
     unsigned int best_sse_inter_motion =
         (unsigned int)(search_state.best_rdc.sse >>
                        (b_width_log2_lookup[bsize] +
@@ -3324,7 +3413,7 @@
       try_palette &&
       (is_mode_intra(best_pickmode->best_mode) || force_palette_test) &&
       x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
-      (cpi->rc.high_source_sad || x->source_variance > 500);
+      (cpi->rc.high_source_sad || x->source_variance > 300);
 
   if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0;
 
@@ -3360,6 +3449,14 @@
 
   if (!is_inter_block(mi)) {
     mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+  } else {
+    // If inter mode is selected and ref_frame was one that uses the
+    // scaled reference frame, then we can't use reuse_inter_pred.
+    if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+        (has_second_ref(mi) &&
+         search_state
+             .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+      x->reuse_inter_pred = 0;
   }
 
   // Restore the predicted samples of best mode to final buffer
@@ -3425,4 +3522,9 @@
 #endif  // COLLECT_NONRD_PICK_MODE_STAT
 
   *rd_cost = search_state.best_rdc;
+
+  // Reset xd->block_ref_scale_factors[i], as they may have
+  // been set to &sf_no_scale, which becomes invalid after
+  // this function returns.
+  set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
 }
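A minimal sketch (not part of the patch; names here are hypothetical) of the pointer-lifetime issue the final set_ref_ptrs() call above guards against: xd->block_ref_scale_factors[] may temporarily point at the stack-local sf_no_scale, so the long-lived defaults must be restored before the function returns.

#include <stdio.h>

struct scale_factors_sketch { int x_scale_fp, y_scale_fp; };
struct xd_sketch {
  const struct scale_factors_sketch *block_ref_scale_factors[2];
};

static const struct scale_factors_sketch kFrameSf = { 1 << 14, 1 << 14 };

static void pick_mode_sketch(struct xd_sketch *xd, int use_scaled_ref) {
  struct scale_factors_sketch sf_no_scale = { 1 << 14, 1 << 14 };  // stack local
  if (use_scaled_ref) {
    // Temporarily disable reference scaling, analogous to the patch setting
    // xd->block_ref_scale_factors[0] = &sf_no_scale.
    xd->block_ref_scale_factors[0] = &sf_no_scale;
  }
  // ... mode evaluation would run here ...
  // Restore the defaults before returning, mirroring the set_ref_ptrs()
  // call at the end of av1_nonrd_pick_inter_mode_sb().
  xd->block_ref_scale_factors[0] = &kFrameSf;
  xd->block_ref_scale_factors[1] = &kFrameSf;
}

int main(void) {
  struct xd_sketch xd = { { &kFrameSf, &kFrameSf } };
  pick_mode_sketch(&xd, /*use_scaled_ref=*/1);
  printf("restored: %d\n", xd.block_ref_scale_factors[0] == &kFrameSf);
  return 0;
}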
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index b1a73e4..7f79e95 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -564,7 +564,24 @@
   }
 
   uint8_t *const color_map = xd->plane[0].color_index_map;
-  if (colors_threshold > 1 && colors_threshold <= 64) {
+  int color_thresh_palette = 64;
+  // Allow a larger color threshold for the palette search, based on color,
+  // scene change, and block source variance.
+  // Since palette is Y based, only allow the larger threshold if the block
+  // color_dist is below a threshold.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad &&
+      x->source_variance > 50) {
+    int64_t norm_color_dist = 0;
+    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+      norm_color_dist = x->min_dist_inter_uv >>
+                        (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+      if (x->color_sensitivity[0] && x->color_sensitivity[1])
+        norm_color_dist = norm_color_dist >> 1;
+    }
+    if (norm_color_dist < 8000) color_thresh_palette += 20;
+  }
+  if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) {
     int16_t *const data = x->palette_buffer->kmeans_data_buf;
     int16_t centroids[PALETTE_MAX_SIZE];
     int lower_bound, upper_bound;
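A minimal worked example (not part of the patch) of the normalized color-distance gate added above. The distortion value is made up; the shifts assume a 16x16 block, for which mi_size_wide_log2 and mi_size_high_log2 are both 2 (4x4 units).

#include <stdio.h>

int main(void) {
  // Assumed example values.
  const long long min_dist_inter_uv = 96000;     // best inter chroma distortion
  const int mi_wide_log2 = 2, mi_high_log2 = 2;  // BLOCK_16X16 in 4x4 units
  const int u_sensitive = 1, v_sensitive = 1;

  long long norm_color_dist = 0;
  int color_thresh_palette = 64;
  if (u_sensitive || v_sensitive) {
    norm_color_dist = min_dist_inter_uv >> (mi_wide_log2 + mi_high_log2);
    if (u_sensitive && v_sensitive) norm_color_dist >>= 1;  // 96000 >> 5 = 3000
  }
  if (norm_color_dist < 8000) color_thresh_palette += 20;  // 64 -> 84
  printf("norm_color_dist=%lld thresh=%d\n", norm_color_dist,
         color_thresh_palette);
  return 0;
}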
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 96567dd..1e3d980 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -40,7 +40,6 @@
 #endif
 
 #define COLLECT_MOTION_SEARCH_FEATURE_SB 0
-#define ML_PARTITION_WHOLE_TREE_DECISION 0
 
 void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
   part_sf->partition_search_type = SEARCH_PARTITION;
@@ -73,6 +72,7 @@
   part_sf->intra_cnn_based_part_prune_level = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
   part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->ext_part_eval_based_on_cur_best = 0;
   part_sf->prune_ext_part_using_split_info = 0;
   part_sf->prune_rectangular_split_based_on_qidx = 0;
   part_sf->early_term_after_none_split = 0;
@@ -1772,6 +1772,9 @@
 
   if (pc_tree->none == NULL) {
     pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    if (!pc_tree->none)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
   }
   PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
 
@@ -1832,6 +1835,9 @@
 
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    if (!pc_tree->split[i])
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
     pc_tree->split[i]->index = i;
   }
   switch (partition) {
@@ -1848,6 +1854,9 @@
       for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
         pc_tree->horizontal[i] =
             av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        if (!pc_tree->horizontal[i])
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       }
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
                     PARTITION_HORZ, subsize, pc_tree->horizontal[0],
@@ -1881,6 +1890,9 @@
       for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
         pc_tree->vertical[i] =
             av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        if (!pc_tree->vertical[i])
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       }
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
                     PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
@@ -1981,6 +1993,9 @@
       if (pc_tree->split[i]->none == NULL)
         pc_tree->split[i]->none =
             av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+      if (!pc_tree->split[i]->none)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
       pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
                     PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
                     invalid_rdc);
@@ -2264,10 +2279,13 @@
   x->force_zeromv_skip_for_blk =
       get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
 
-  if (!x->force_zeromv_skip_for_blk) {
+  // Source variance may already be computed at the superblock level, so only
+  // recompute it if bsize < sb_size or source_variance is not yet set.
+  if (!x->force_zeromv_skip_for_blk &&
+      (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
     x->source_variance = av1_get_perpixel_variance_facade(
         cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
-  }
+
   // Save rdmult before it might be changed, so it can be restored later.
   const int orig_rdmult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
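
As a rough restatement of the reuse condition above, with UINT_MAX acting as the "not computed yet" sentinel; the helper below is hypothetical.

#include <limits.h>

// Recompute the per-block source variance only when no cached value exists
// (sentinel UINT_MAX) or the block is smaller than the superblock, in which
// case the superblock-level value would not describe this block.
static int need_block_source_variance(unsigned int cached_variance,
                                      int bsize, int sb_size) {
  return cached_variance == UINT_MAX || bsize < sb_size;
}
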
@@ -2396,6 +2414,9 @@
   av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   if (!pc_tree->none) {
     pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    if (!pc_tree->none)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
   } else {
     av1_reset_pmc(pc_tree->none);
   }
@@ -2416,6 +2437,9 @@
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     if (!pc_tree->split[i]) {
       pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+      if (!pc_tree->split[i])
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
     }
     pc_tree->split[i]->index = i;
   }
@@ -2434,6 +2458,9 @@
     if (!pc_tree->split[i]->none) {
       pc_tree->split[i]->none =
           av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+      if (!pc_tree->split[i]->none)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
     } else {
       av1_reset_pmc(pc_tree->split[i]->none);
     }
@@ -2568,6 +2595,9 @@
   pc_tree->partitioning = PARTITION_NONE;
   if (!pc_tree->none) {
     pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    if (!pc_tree->none)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
   } else {
     av1_reset_pmc(pc_tree->none);
   }
@@ -2599,6 +2629,9 @@
         if (!pc_tree->split[i]->none) {
           pc_tree->split[i]->none =
               av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+          if (!pc_tree->split[i]->none)
+            aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PICK_MODE_CONTEXT");
         } else {
           av1_reset_pmc(pc_tree->split[i]->none);
         }
@@ -2659,6 +2692,9 @@
       if (!pc_tree->split[i]->none) {
         pc_tree->split[i]->none =
             av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        if (!pc_tree->split[i]->none)
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       }
       encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
                      subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
@@ -2762,6 +2798,7 @@
   struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   int force_skip_low_temp_var = 0;
   int skip_pred_mv = 0;
+  bool use_scaled_ref;
 
   for (int i = 0; i < MB_MODE_COUNT; ++i) {
     for (int j = 0; j < REF_FRAMES; ++j) {
@@ -2774,7 +2811,7 @@
                   x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
 
   find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize,
-                  force_skip_low_temp_var, skip_pred_mv);
+                  force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
 
   int continue_merging = 1;
   if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
@@ -2790,7 +2827,7 @@
     av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
                                        mi_col, this_mi[0]->bsize);
     find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize,
-                    force_skip_low_temp_var, skip_pred_mv);
+                    force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
   } else {
     struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
     const int is_scaled = av1_is_scaled(sf);
@@ -2945,6 +2982,9 @@
     case PARTITION_NONE:
       if (!pc_tree->none) {
         pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+        if (!pc_tree->none)
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       } else {
         av1_reset_pmc(pc_tree->none);
       }
@@ -2958,6 +2998,9 @@
         if (!pc_tree->vertical[i]) {
           pc_tree->vertical[i] =
               av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+          if (!pc_tree->vertical[i])
+            aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PICK_MODE_CONTEXT");
         } else {
           av1_reset_pmc(pc_tree->vertical[i]);
         }
@@ -2978,6 +3021,9 @@
         if (!pc_tree->horizontal[i]) {
           pc_tree->horizontal[i] =
               av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+          if (!pc_tree->horizontal[i])
+            aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PICK_MODE_CONTEXT");
         } else {
           av1_reset_pmc(pc_tree->horizontal[i]);
         }
@@ -2998,6 +3044,9 @@
       for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
         if (!pc_tree->split[i]) {
           pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+          if (!pc_tree->split[i])
+            aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PC_TREE");
         }
         pc_tree->split[i]->index = i;
       }
@@ -3494,6 +3543,9 @@
       if (cur_ctx[i][j][0] == NULL) {
         cur_ctx[i][j][0] =
             av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+        if (!cur_ctx[i][j][0])
+          aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
       }
     }
     sum_rdc->rate = part_search_state->partition_cost[partition_type];
@@ -3573,7 +3625,7 @@
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
   const int mi_row = blk_params.mi_row;
   const int mi_col = blk_params.mi_col;
-  const int bsize = blk_params.bsize;
+  const BLOCK_SIZE bsize = blk_params.bsize;
   int64_t this_rdcost = 0;
 
 #if CONFIG_COLLECT_PARTITION_STATS
@@ -3683,7 +3735,7 @@
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
   const int mi_row = blk_params.mi_row;
   const int mi_col = blk_params.mi_col;
-  const int bsize = blk_params.bsize;
+  const BLOCK_SIZE bsize = blk_params.bsize;
 
   if (part_search_state->terminate_partition_search) {
     return;
@@ -3759,6 +3811,9 @@
       // Set AB partition context.
       cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
           cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      if (!cur_part_ctxs[ab_part_type][i])
+        aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
       // Set mode as not ready.
       cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
     }
@@ -3818,8 +3873,12 @@
       part_search_state->partition_cost[partition_type];
   part_search_state->sum_rdc.rdcost =
       RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
-  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i)
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
     cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+    if (!cur_part_ctx[i])
+      aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
+  }
 }
 
 // Partition search of HORZ4 / VERT4 partition types.
@@ -3882,6 +3941,50 @@
                       blk_params.bsize, av1_num_planes(cm));
 }
 
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+    PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+    int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+  if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+    if (part_none->skippable && !must_find_valid_partition &&
+        bsize >= BLOCK_16X16) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Determine whether AB partition search is allowed for the current block.
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+                                     PARTITION_SPEED_FEATURES *part_sf,
+                                     PARTITION_TYPE curr_best_part,
+                                     int must_find_valid_partition,
+                                     int prune_ext_part_state,
+                                     int64_t best_rdcost) {
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdcost == INT64_MAX) return 1;
+
+  // Determine bsize threshold to evaluate ab partitions
+  BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+  if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+      !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+    ab_bsize_thresh = BLOCK_128X128;
+
+  // ab partitions are only allowed for square block sizes BLOCK_16X16 or
+  // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+  // BLOCK_8X8.
+  assert(ab_bsize_thresh >= BLOCK_8X8);
+
+  int ab_partition_allowed =
+      part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+  return ab_partition_allowed;
+}
+
 // Prune 4-way partitions based on the number of horz/vert wins
 // in the current block and sub-blocks in PARTITION_SPLIT.
 static void prune_4_partition_using_split_info(
@@ -3915,9 +4018,28 @@
 static void prune_4_way_partition_search(
     AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
     PartitionSearchState *part_search_state, RD_STATS *best_rdc,
-    int pb_source_variance, int ext_partition_allowed,
+    int pb_source_variance, int prune_ext_part_state,
     int part4_search_allowed[NUM_PART4_TYPES]) {
-  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdc->rdcost == INT64_MAX) return;
+
+  // Determine bsize threshold to evaluate 4-way partitions
+  BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+  if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+      !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+    part4_bsize_thresh = BLOCK_128X128;
+
+  // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+  // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+  // BLOCK_4X4 and BLOCK_8X8.
+  assert(part4_bsize_thresh >= BLOCK_8X8);
+
+  bool partition4_allowed =
+      part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
 
   // Disable 4-way partition search flags for width less than a multiple of the
   // minimum partition width.
@@ -3928,17 +4050,15 @@
     return;
   }
 
-  const int bsize = blk_params.bsize;
   PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
                                                PARTITION_VERT_4 };
   const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
   // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
   // PARTITION_VERT_4 for this block. This is almost the same as
-  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+  // its initial value above, except that we don't allow 128x32 or 32x128
   // blocks, so we require that bsize is not BLOCK_128X128.
-  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
-                                 ext_partition_allowed &&
-                                 bsize != BLOCK_128X128;
+  partition4_allowed &=
+      part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
 
   for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
     part4_search_allowed[i] =
@@ -3988,6 +4108,9 @@
   // Set PARTITION_NONE context.
   if (pc_tree->none == NULL)
     pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf);
+  if (!pc_tree->none)
+    aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PICK_MODE_CONTEXT");
 
   // Set PARTITION_NONE type cost.
   if (part_search_state->partition_none_allowed) {
@@ -4215,7 +4338,7 @@
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mi_row = blk_params.mi_row;
   const int mi_col = blk_params.mi_col;
-  const int bsize = blk_params.bsize;
+  const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   RD_STATS sum_rdc = part_search_state->sum_rdc;
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
@@ -4228,6 +4351,9 @@
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     if (pc_tree->split[i] == NULL)
       pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    if (!pc_tree->split[i])
+      aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
     pc_tree->split[i]->index = i;
   }
 
@@ -4400,6 +4526,7 @@
   fclose(pfile);
 }
 
+#if CONFIG_PARTITION_SEARCH_ORDER
 static void verify_write_partition_tree(const AV1_COMP *const cpi,
                                         const PC_TREE *const pc_tree,
                                         const BLOCK_SIZE bsize,
@@ -4463,6 +4590,7 @@
 }
 
 static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+                               struct aom_internal_error_info *error_info,
                                const int config_id) {
   const AV1_COMMON *const cm = &cpi->common;
   const char *path = cpi->oxcf.partition_info_path;
@@ -4502,6 +4630,9 @@
       for (int i = 0; i < 4; ++i) {
         if (node != NULL) {  // Suppress warning
           node->split[i] = av1_alloc_pc_tree_node(subsize);
+          if (!node->split[i])
+            aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PC_TREE");
           node->split[i]->index = i;
           tree_node_queue[last_idx] = node->split[i];
           ++last_idx;
@@ -4665,7 +4796,8 @@
 
 static void build_pc_tree_from_part_decision(
     const aom_partition_decision_t *partition_decision,
-    const BLOCK_SIZE this_bsize, PC_TREE *pc_tree) {
+    const BLOCK_SIZE this_bsize, PC_TREE *pc_tree,
+    struct aom_internal_error_info *error_info) {
   BLOCK_SIZE bsize = this_bsize;
   int num_nodes = partition_decision->num_nodes;
   PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
@@ -4686,6 +4818,9 @@
       for (int i = 0; i < 4; ++i) {
         if (node != NULL) {  // Suppress warning
           node->split[i] = av1_alloc_pc_tree_node(subsize);
+          if (!node->split[i])
+            aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PC_TREE");
           node->split[i]->index = i;
           tree_node_queue[last_idx] = node->split[i];
           ++last_idx;
@@ -4707,6 +4842,7 @@
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  struct aom_internal_error_info *error_info = x->e_mbd.error_info;
   aom_partition_features_t features;
   prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
                                     &features);
@@ -4716,7 +4852,6 @@
   features.frame_height = cpi->frame_info.frame_height;
   features.block_size = bsize;
   av1_ext_part_send_features(ext_part_controller, &features);
-  PC_TREE *pc_tree;
 
   // rd mode search (dry run) for a valid partition decision from the ml model.
   aom_partition_decision_t partition_decision;
@@ -4728,26 +4863,32 @@
     // First, let's take the easy approach.
     // We require that the ml model has to provide partition decisions for the
     // whole superblock.
-    pc_tree = av1_alloc_pc_tree_node(bsize);
-    build_pc_tree_from_part_decision(&partition_decision, bsize, pc_tree);
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (!td->pc_root)
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+    build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+                                     error_info);
 
     const RD_STATS this_rdcost = rd_search_for_fixed_partition(
-        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
     aom_partition_stats_t stats;
     update_partition_stats(&this_rdcost, &stats);
     av1_ext_part_send_partition_stats(ext_part_controller, &stats);
     if (!partition_decision.is_final_decision) {
-      av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                                  cpi->sf.part_sf.partition_search_type);
+      td->pc_root = NULL;
     }
   } while (!partition_decision.is_final_decision);
 
   // Encode with the selected mode and partition.
   set_cb_offsets(x->cb_offset, 0, 0);
   encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-            pc_tree, NULL);
-  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                              cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
 
   return true;
 }
@@ -4978,6 +5119,9 @@
         av1_init_rd_stats(&split_rdc[i]);
         if (pc_tree->split[i] == NULL)
           pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+        if (!pc_tree->split[i])
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PC_TREE");
         pc_tree->split[i]->index = i;
       }
       const int orig_rdmult_tmp = x->rdmult;
@@ -5050,12 +5194,14 @@
   features.frame_height = cpi->frame_info.frame_height;
   features.block_size = bsize;
   av1_ext_part_send_features(ext_part_controller, &features);
-  PC_TREE *pc_tree;
-  pc_tree = av1_alloc_pc_tree_node(bsize);
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (!td->pc_root)
+    aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
 
   RD_STATS rdcost;
   const bool valid_partition =
-      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree, mi_row,
+      recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
                           mi_col, bsize, &rdcost);
   if (!valid_partition) {
     return false;
@@ -5064,9 +5210,10 @@
   // Encode with the selected mode and partition.
   set_cb_offsets(x->cb_offset, 0, 0);
   encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-            pc_tree, NULL);
-  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                              cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
 
   return true;
 }
@@ -5100,54 +5247,65 @@
   }
 
   MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int best_idx = 0;
   int64_t min_rdcost = INT64_MAX;
   int num_configs;
-  RD_STATS *rdcost = NULL;
   int i = 0;
   do {
-    PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
-    num_configs = read_partition_tree(cpi, pc_tree, i);
-    if (i == 0) {
-      CHECK_MEM_ERROR(cm, rdcost, aom_calloc(num_configs, sizeof(*rdcost)));
-    }
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (!td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+    num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i);
     if (num_configs <= 0) {
-      av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                                  cpi->sf.part_sf.partition_search_type);
-      if (rdcost != NULL) aom_free(rdcost);
-      aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid configs.");
+      td->pc_root = NULL;
+      aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs.");
     }
-    verify_write_partition_tree(cpi, pc_tree, bsize, i, mi_row, mi_col);
+    verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col);
+    if (i == 0) {
+      AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost,
+                          aom_calloc(num_configs, sizeof(*x->rdcost)));
+    }
     // Encode the block with the given partition tree. Get rdcost and encoding
     // time.
-    rdcost[i] = rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root,
-                                              mi_row, mi_col, bsize, pc_tree);
+    x->rdcost[i] = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
 
-    if (rdcost[i].rdcost < min_rdcost) {
-      min_rdcost = rdcost[i].rdcost;
+    if (x->rdcost[i].rdcost < min_rdcost) {
+      min_rdcost = x->rdcost[i].rdcost;
       best_idx = i;
-      *best_rd_cost = rdcost[i];
+      *best_rd_cost = x->rdcost[i];
     }
-    av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+    av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                                cpi->sf.part_sf.partition_search_type);
+    td->pc_root = NULL;
     ++i;
   } while (i < num_configs);
 
+  aom_free(x->rdcost);
+  x->rdcost = NULL;
   // Encode with the partition configuration with the smallest rdcost.
-  PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
-  read_partition_tree(cpi, pc_tree, best_idx);
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (!td->pc_root)
+    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+  read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
   rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
-                                mi_col, bsize, pc_tree);
+                                mi_col, bsize, td->pc_root);
   set_cb_offsets(x->cb_offset, 0, 0);
   encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-            pc_tree, NULL);
-  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
                              cpi->sf.part_sf.partition_search_type);
-  aom_free(rdcost);
+  td->pc_root = NULL;
   ++cpi->sb_counter;
 
   return true;
 }
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
 
 static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
     BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
@@ -5493,25 +5651,21 @@
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
                  !part_search_state.do_rectangular_split));
 
-  int ext_partition_allowed =
-      part_search_state.do_rectangular_split &&
-      bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
-      av1_blk_has_rows_and_cols(&blk_params);
+  const int prune_ext_part_state = prune_ext_part_none_skippable(
+      pc_tree->none, x->must_find_valid_partition,
+      cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize);
 
-  // Do not evaluate extended partitions if NONE partition is skippable.
-  if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 1) &&
-      (pc_tree->none != NULL)) {
-    if (pc_tree->none->skippable && !x->must_find_valid_partition &&
-        bsize >= BLOCK_16X16)
-      ext_partition_allowed = 0;
-  }
+  const int ab_partition_allowed = allow_ab_partition_search(
+      &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning,
+      x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost);
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, ab_partitions_search_time);
 #endif
   // AB partitions search stage.
   ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
                        &part_search_state, &best_rdc, rect_part_win_info,
-                       pb_source_variance, ext_partition_allowed, HORZ_A,
+                       pb_source_variance, ab_partition_allowed, HORZ_A,
                        VERT_B);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, ab_partitions_search_time);
@@ -5521,7 +5675,7 @@
   int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
   // Prune 4-way partition search.
   prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
-                               pb_source_variance, ext_partition_allowed,
+                               pb_source_variance, prune_ext_part_state,
                                part4_search_allowed);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -5618,9 +5772,12 @@
       set_cb_offsets(x->cb_offset, 0, 0);
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
                 pc_tree, NULL);
+      assert(pc_tree == td->pc_root);
       // Dealloc the whole PC_TREE after a superblock is done.
       av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
                                  cpi->sf.part_sf.partition_search_type);
+      pc_tree = NULL;
+      td->pc_root = NULL;
       pc_tree_dealloc = 1;
     } else if (should_do_dry_run_encode_for_current_block(
                    cm->seq_params->sb_size, x->sb_enc.max_partition_size,
@@ -5941,6 +6098,9 @@
   // PARTITION_NONE
   if (partition_none_allowed) {
     pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    if (!pc_tree->none)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
     PICK_MODE_CONTEXT *ctx = pc_tree->none;
 
 // Flip for RDO based pick mode
@@ -5974,6 +6134,9 @@
 
     for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
       pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+      if (!pc_tree->split[i])
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
       pc_tree->split[i]->index = i;
     }
 
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
index 2577e79..1b5d71b 100644
--- a/av1/encoder/partition_search.h
+++ b/av1/encoder/partition_search.h
@@ -42,11 +42,14 @@
 void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
 void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
 
+#if CONFIG_PARTITION_SEARCH_ORDER
 bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
                              TileDataEnc *tile_data, TokenExtra **tp,
                              SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
                              int mi_col, BLOCK_SIZE bsize,
                              RD_STATS *best_rd_cost);
+#endif
+
 bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                            TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
                            int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 080587b..ce06313 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -108,7 +108,7 @@
                                    const bool is_test_mode,
                                    const float *features,
                                    const int feature_size, const int id,
-                                   const int bsize, const int mi_row,
+                                   const BLOCK_SIZE bsize, const int mi_row,
                                    const int mi_col) {
   if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
 
@@ -118,7 +118,8 @@
   FILE *pfile = fopen(filename, "a");
   if (pfile == NULL) return;
   if (!is_test_mode) {
-    fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size);
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col,
+            feature_size);
   }
   for (int i = 0; i < feature_size; ++i) {
     fprintf(pfile, "%.6f", features[i]);
@@ -203,7 +204,7 @@
       if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
                                                 cnn_config, &thread_data,
                                                 bit_depth, &output)) {
-        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
                            "Error allocating CNN data");
         return;
       }
@@ -212,7 +213,7 @@
 
       if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
                                          cnn_config, &thread_data, &output)) {
-        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
                            "Error allocating CNN data");
         return;
       }
@@ -471,8 +472,6 @@
 
   // Otherwise do loop through the reference frames and find the one with the
   // minimum SSE
-  const MACROBLOCKD *xd = &x->e_mbd;
-
   const int num_planes = 1;
 
   *best_sse = INT_MAX;
@@ -483,12 +482,9 @@
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
       const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
       unsigned int curr_sse = 0, curr_var = 0;
-      int_mv best_mv =
-          av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
-                                   start_mvs[ref], num_planes, use_subpixel);
-      curr_var = cpi->ppi->fn_ptr[bsize].vf(
-          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
-          xd->plane[0].dst.stride, &curr_sse);
+      const int_mv best_mv = av1_simple_motion_search_sse_var(
+          cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes,
+          use_subpixel, &curr_sse, &curr_var);
       if (curr_sse < *best_sse) {
         *best_sse = curr_sse;
         *best_var = curr_var;
@@ -840,8 +836,11 @@
       unsigned int sse = 0;
       unsigned int var = 0;
       const FULLPEL_MV start_mv = kZeroFullMv;
-      int_mv best_mv = av1_simple_motion_sse_var(
-          cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var);
+      const MV_REFERENCE_FRAME ref =
+          cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+      const int_mv best_mv = av1_simple_motion_search_sse_var(
+          cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse,
+          &var);
 
       const float mv_row = (float)(best_mv.as_mv.row / 8);
       const float mv_col = (float)(best_mv.as_mv.col / 8);
@@ -1214,7 +1213,7 @@
   const PartitionBlkParams blk_params = part_state->part_blk_params;
   const int mi_row = blk_params.mi_row;
   const int mi_col = blk_params.mi_col;
-  const int bsize = blk_params.bsize;
+  const BLOCK_SIZE bsize = blk_params.bsize;
 
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   const NN_CONFIG *nn_config = NULL;
@@ -1317,7 +1316,7 @@
   const PartitionBlkParams blk_params = part_state->part_blk_params;
   const int mi_row = blk_params.mi_row;
   const int mi_col = blk_params.mi_col;
-  const int bsize = blk_params.bsize;
+  const BLOCK_SIZE bsize = blk_params.bsize;
 
   int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
   int64_t *split_rd = part_state->split_rd;
@@ -1331,6 +1330,7 @@
   int64_t *horz_rd = rect_part_rd[HORZ4];
   int64_t *vert_rd = rect_part_rd[VERT4];
   const NN_CONFIG *nn_config = NULL;
+  // 4-way partitions are only allowed for these three square block sizes.
   switch (bsize) {
     case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
     case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
@@ -1377,6 +1377,10 @@
   {
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+    assert(horz_4_bs != BLOCK_INVALID);
+    assert(vert_4_bs != BLOCK_INVALID);
+
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
                          av1_num_planes(&cpi->common), bsize);
     const int src_stride = x->plane[0].src.stride;
@@ -1650,17 +1654,15 @@
 
   if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
     const MACROBLOCKD *const xd = &x->e_mbd;
-    int prune_sub_8x8 = 1;
-    if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 1) {
-      int num_neighbors_lt_8x8 = 0;
-      if (xd->left_available)
-        num_neighbors_lt_8x8 += (xd->left_mbmi->bsize <= BLOCK_8X8);
-      if (xd->up_available)
-        num_neighbors_lt_8x8 += (xd->above_mbmi->bsize <= BLOCK_8X8);
-      // Evaluate only if both left and above blocks are of size <= BLOCK_8X8.
-      if (num_neighbors_lt_8x8 == 2) {
-        prune_sub_8x8 = 0;
-      }
+    int prune_sub_8x8;
+    if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) {
+      prune_sub_8x8 = 1;
+    } else {
+      assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1);
+      // Prune if both neighbors are available and either is > BLOCK_8X8
+      prune_sub_8x8 = xd->left_available && xd->up_available &&
+                      (xd->left_mbmi->bsize > BLOCK_8X8 ||
+                       xd->above_mbmi->bsize > BLOCK_8X8);
     }
     if (prune_sub_8x8) {
       av1_disable_all_splits(part_state);
@@ -1962,12 +1964,19 @@
     features->after_part_ab.f[feature_index++] = rd_ratio;
   }
 
+  // 4-way partitions are only allowed for these three square block sizes.
+  assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
   // Get variance of the 1:4 and 4:1 sub-blocks.
   unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   {
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+    assert(horz_4_bs != BLOCK_INVALID);
+    assert(vert_4_bs != BLOCK_INVALID);
+
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
                          av1_num_planes(&cpi->common), bsize);
     const int src_stride = x->plane[0].src.stride;
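
The rewritten sub-8x8 pruning above reduces to a small predicate. A hedged restatement follows, with the availability and size checks passed in as plain flags; the names are illustrative.

// Level 2 always prunes sub-8x8 partitions. Level 1 prunes only when both
// the left and above neighbours are present and at least one of them settled
// on a block larger than 8x8, i.e. the surrounding content did not need
// sub-8x8 detail either.
static int prune_sub_8x8_sketch(int level, int left_avail, int up_avail,
                                int left_gt_8x8, int above_gt_8x8) {
  if (level == 2) return 1;
  return left_avail && up_avail && (left_gt_8x8 || above_gt_8x8);
}
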
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 46bc6b0..3641c8b 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -181,7 +181,7 @@
   // Based on recent history adjust expectations of bits per macroblock.
   double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
   double rate_err_factor = 1.0;
-  const double adj_limit = AOMMAX(0.20, (double)(100 - rate_err_tol) / 200.0);
+  const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0);
   const double min_fac = 1.0 - adj_limit;
   const double max_fac = 1.0 + adj_limit;
 
@@ -255,7 +255,6 @@
       rate_err_factor = 1.0 - ((double)(bits_off_target) /
                                AOMMAX(total_actual_bits, bits_left));
     }
-    rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
 
     // Adjustment is damped if this is 1 pass with look ahead processing
     // (as there are only ever a few frames of data) and for all but the first
@@ -263,6 +262,7 @@
     if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
       rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
     }
+    rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
   }
 
   // Is the rate control trending in the right direction. Only make
@@ -270,7 +270,12 @@
   if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
       (rate_err_factor > 1.0 && err_estimate <= 0)) {
     twopass->bpm_factor *= rate_err_factor;
-    twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+    if (rate_err_tol >= 100) {
+      twopass->bpm_factor =
+          AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+    } else {
+      twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+    }
   }
 }
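
A minimal numeric sketch of the reordered clamping in the hunks above, using plain doubles and eliding the trend and damping guards that the real code applies; the function name is illustrative.

#include <math.h>

// Damp the raw rate-error factor first, clamp it to [min_fac, max_fac], then
// fold it into the bits-per-macroblock factor. The tight clamp on bpm_factor
// is kept only for large rate-error tolerances; otherwise a wide [0.1, 10.0]
// range is allowed.
static double adjust_bpm_factor_sketch(double bpm_factor,
                                       double rate_err_factor,
                                       double rate_err_tol, double damp_fac) {
  const double adj_limit = fmax(0.2, (100.0 - rate_err_tol) / 200.0);
  const double min_fac = 1.0 - adj_limit;
  const double max_fac = 1.0 + adj_limit;

  rate_err_factor = 1.0 + (rate_err_factor - 1.0) / damp_fac;
  rate_err_factor = fmax(min_fac, fmin(max_fac, rate_err_factor));

  bpm_factor *= rate_err_factor;
  return rate_err_tol >= 100.0 ? fmax(min_fac, fmin(max_fac, bpm_factor))
                               : fmax(0.1, fmin(10.0, bpm_factor));
}
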
 
@@ -1748,22 +1753,42 @@
   cleanup_regions(regions, num_regions);
 }
 
-void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
-                          int total_frames, int offset, REGIONS *regions,
-                          int *total_regions) {
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+                                         double *filt_intra_err,
+                                         double *filt_coded_err,
+                                         double *grad_coded) {
+  aom_free(temp_regions);
+  aom_free(filt_intra_err);
+  aom_free(filt_coded_err);
+  aom_free(grad_coded);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// Returns 0 on success, -1 on memory allocation failure.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+                            int total_frames, int offset, REGIONS *regions,
+                            int *total_regions) {
   int k;
-  if (total_frames <= 1) return;
+  if (total_frames <= 1) return 0;
 
   // store the initial decisions
   REGIONS *temp_regions =
       (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
-  av1_zero_array(temp_regions, total_frames);
   // buffers for filtered stats
   double *filt_intra_err =
       (double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
   double *filt_coded_err =
       (double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
   double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+  if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+    free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+                                 grad_coded);
+    return -1;
+  }
+  av1_zero_array(temp_regions, total_frames);
 
   int cur_region = 0, this_start = 0, this_last;
 
@@ -1853,10 +1878,9 @@
     regions[k].last += offset;
   }
 
-  aom_free(temp_regions);
-  aom_free(filt_coded_err);
-  aom_free(filt_intra_err);
-  aom_free(grad_coded);
+  free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+                               grad_coded);
+  return 0;
 }
 
 static int find_regions_index(const REGIONS *regions, int num_regions,
@@ -3794,6 +3818,7 @@
                                     (rc->frames_since_key == 0)));
       p_rc->frames_till_regions_update = rest_frames;
 
+      int ret;
       if (cpi->ppi->lap_enabled) {
         av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
                          twopass->stats_buf_ctx->stats_in_end);
@@ -3801,14 +3826,18 @@
                            twopass->stats_buf_ctx->stats_in_end);
         av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
                            twopass->stats_buf_ctx->stats_in_end);
-        av1_identify_regions(cpi->twopass_frame.stats_in, rest_frames,
-                             (rc->frames_since_key == 0), p_rc->regions,
-                             &p_rc->num_regions);
+        ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+                               (rc->frames_since_key == 0), p_rc->regions,
+                               &p_rc->num_regions);
       } else {
-        av1_identify_regions(
+        ret = identify_regions(
             cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
             rest_frames, 0, p_rc->regions, &p_rc->num_regions);
       }
+      if (ret == -1) {
+        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Error allocating buffers in identify_regions");
+      }
     }
 
     int cur_region_idx =
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index e34454e..ff1591c 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -134,14 +134,6 @@
                        int *num_fpstats_used, int *num_fpstats_required,
                        int project_gfu_boost);
 
-// Identify stable and unstable regions from first pass stats.
-// stats_start points to the first frame to analyze.
-// |offset| is the offset from the current frame to the frame stats_start is
-// pointing to.
-void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
-                          int total_frames, int offset, REGIONS *regions,
-                          int *total_regions);
-
 void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
                       FIRSTPASS_STATS *last_stats);
 void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 293dafa..232a2f9 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -515,8 +515,12 @@
 //   fbc: Column index in units of 64x64 block
 // Returns:
 //   Nothing will be returned. Contents of cdef_search_ctx will be modified.
-void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
-                             int sb_count) {
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+                             struct aom_internal_error_info *error_info,
+                             int fbr, int fbc, int sb_count) {
+  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+  // in future to handle error propagation.
+  (void)error_info;
   const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
   const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
   const int coeff_shift = cdef_search_ctx->coeff_shift;
@@ -614,14 +618,15 @@
 //   CDEF search context.
 // Returns:
 //   Nothing will be returned. Contents of cdef_search_ctx will be modified.
-static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx,
+                                struct aom_internal_error_info *error_info) {
   // Loop over each sb.
   for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
     for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
       // Checks if cdef processing can be skipped for particular sb.
       if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
       // Calculate mse for each sb and store the relevant sb index.
-      av1_cdef_mse_calc_block(cdef_search_ctx, fbr, fbc,
+      av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc,
                               cdef_search_ctx->sb_count);
       cdef_search_ctx->sb_count++;
     }
@@ -634,24 +639,17 @@
 //   related to CDEF search context.
 // Returns:
 //   Nothing will be returned. Contents of cdef_search_ctx will be modified.
-static AOM_INLINE bool cdef_alloc_data(CdefSearchCtx *cdef_search_ctx) {
+static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) {
   const int nvfb = cdef_search_ctx->nvfb;
   const int nhfb = cdef_search_ctx->nhfb;
-  cdef_search_ctx->sb_index =
-      aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0]));
+  CHECK_MEM_ERROR(
+      cm, cdef_search_ctx->sb_index,
+      aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0])));
   cdef_search_ctx->sb_count = 0;
-  cdef_search_ctx->mse[0] =
-      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
-  cdef_search_ctx->mse[1] =
-      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
-  if (!(cdef_search_ctx->sb_index && cdef_search_ctx->mse[0] &&
-        cdef_search_ctx->mse[1])) {
-    aom_free(cdef_search_ctx->sb_index);
-    aom_free(cdef_search_ctx->mse[0]);
-    aom_free(cdef_search_ctx->mse[1]);
-    return false;
-  }
-  return true;
+  CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0],
+                  aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+  CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1],
+                  aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
 }
 
 // Deallocates the memory allocated for members of CdefSearchCtx.
@@ -660,10 +658,15 @@
 //   related to CDEF search context.
 // Returns:
 //   Nothing will be returned.
-static AOM_INLINE void cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
-  aom_free(cdef_search_ctx->mse[0]);
-  aom_free(cdef_search_ctx->mse[1]);
-  aom_free(cdef_search_ctx->sb_index);
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+  if (cdef_search_ctx) {
+    aom_free(cdef_search_ctx->mse[0]);
+    cdef_search_ctx->mse[0] = NULL;
+    aom_free(cdef_search_ctx->mse[1]);
+    cdef_search_ctx->mse[1] = NULL;
+    aom_free(cdef_search_ctx->sb_index);
+    cdef_search_ctx->sb_index = NULL;
+  }
 }
 
 // Initialize the parameters related to CDEF search context.
@@ -818,14 +821,12 @@
   }
 }
 
-void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
-                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
-                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
-                     int skip_cdef_feature, CDEF_CONTROL cdef_control,
-                     const int is_screen_content, int non_reference_frame,
-                     int rtc_ext_rc) {
+void av1_cdef_search(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control;
+
   assert(cdef_control != CDEF_NONE);
-  if (cdef_control == CDEF_REFERENCE && non_reference_frame) {
+  if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) {
     CdefInfo *const cdef_info = &cm->cdef_info;
     cdef_info->nb_cdef_strengths = 1;
     cdef_info->cdef_bits = 0;
@@ -834,12 +835,21 @@
     return;
   }
 
+  // Indicate if external RC is used for testing
+  const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl;
   if (rtc_ext_rc) {
     av1_pick_cdef_from_qp(cm, 0, 0);
     return;
   }
+  CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method;
   if (pick_method == CDEF_PICK_FROM_Q) {
-    av1_pick_cdef_from_qp(cm, skip_cdef_feature, is_screen_content);
+    const int use_screen_content_model =
+        cm->quant_params.base_qindex >
+            AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
+                   cpi->rc.best_quality + 5) &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+    av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb,
+                          use_screen_content_model);
     return;
   }
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -847,33 +857,33 @@
   const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
                     pick_method <= CDEF_FAST_SEARCH_LVL5);
   const int num_planes = av1_num_planes(cm);
-  CdefSearchCtx cdef_search_ctx;
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+  if (!cpi->cdef_search_ctx)
+    CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx,
+                    aom_malloc(sizeof(*cpi->cdef_search_ctx)));
+  CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx;
+
   // Initialize parameters related to CDEF search context.
-  cdef_params_init(frame, ref, cm, xd, &cdef_search_ctx, pick_method);
+  cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx,
+                   pick_method);
   // Allocate CDEF search context buffers.
-  if (!cdef_alloc_data(&cdef_search_ctx)) {
-    CdefInfo *const cdef_info = &cm->cdef_info;
-    cdef_info->nb_cdef_strengths = 0;
-    cdef_info->cdef_bits = 0;
-    cdef_info->cdef_strengths[0] = 0;
-    cdef_info->cdef_uv_strengths[0] = 0;
-    return;
-  }
+  cdef_alloc_data(cm, cdef_search_ctx);
   // Frame level mse calculation.
-  if (mt_info->num_workers > 1) {
-    av1_cdef_mse_calc_frame_mt(cm, mt_info, &cdef_search_ctx);
+  if (cpi->mt_info.num_workers > 1) {
+    av1_cdef_mse_calc_frame_mt(cpi);
   } else {
-    cdef_mse_calc_frame(&cdef_search_ctx);
+    cdef_mse_calc_frame(cdef_search_ctx, cm->error);
   }
 
   /* Search for different number of signaling bits. */
   int nb_strength_bits = 0;
   uint64_t best_rd = UINT64_MAX;
   CdefInfo *const cdef_info = &cm->cdef_info;
-  int sb_count = cdef_search_ctx.sb_count;
+  int sb_count = cdef_search_ctx->sb_count;
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
-  mse[0] = cdef_search_ctx.mse[0];
-  mse[1] = cdef_search_ctx.mse[1];
+  mse[0] = cdef_search_ctx->mse[0];
+  mse[1] = cdef_search_ctx->mse[1];
   /* Calculate the maximum number of bits required to signal CDEF strengths at
    * block level */
   const int total_strengths = nb_cdef_strengths[pick_method];
@@ -881,6 +891,7 @@
       num_planes > 1 ? total_strengths * total_strengths : total_strengths;
   const int max_signaling_bits =
       joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
+  int rdmult = cpi->td.mb.rdmult;
   for (int i = 0; i <= 3; i++) {
     if (i > max_signaling_bits) break;
     int best_lev0[CDEF_MAX_STRENGTHS];
@@ -925,7 +936,7 @@
         best_mse = curr;
       }
     }
-    mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
+    mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength =
         best_gi;
   }
   if (fast) {
@@ -943,5 +954,5 @@
 
   cdef_info->cdef_damping = damping;
   // Deallocate CDEF search context buffers.
-  cdef_dealloc_data(&cdef_search_ctx);
+  av1_cdef_dealloc_data(cdef_search_ctx);
 }
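
For the CDEF_PICK_FROM_Q path above, the screen-content decision collapses to a qindex comparison. A hedged standalone restatement follows; the parameter names are illustrative.

// Use the screen-content variant of CDEF-from-QP only when the encoder is
// tuned for screen content and the base qindex clears both the speed-feature
// threshold and a small margin above the best allowed quality.
static int use_screen_content_cdef_sketch(int base_qindex,
                                          int sf_qindex_thresh,
                                          int best_quality, int is_screen) {
  const int thresh = sf_qindex_thresh > best_quality + 5 ? sf_qindex_thresh
                                                         : best_quality + 5;
  return is_screen && base_qindex > thresh;
}
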
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index bdd8233..192e734 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -213,8 +213,11 @@
   return 0;
 }
 
-void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
-                             int sb_count);
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+                             struct aom_internal_error_info *error_info,
+                             int fbr, int fbc, int sb_count);
 /*!\endcond */
 
 /*!\brief AV1 CDEF parameter search
@@ -223,19 +226,7 @@
  *
  * Searches for optimal CDEF parameters for frame
  *
- * \param[in]      mt_info      Pointer to multi-threading parameters
- * \param[in]      frame        Compressed frame buffer
- * \param[in]      ref          Source frame buffer
- * \param[in,out]  cm           Pointer to top level common structure
- * \param[in]      xd           Pointer to common current coding block structure
- * \param[in]      pick_method  The method used to select params
- * \param[in]      rdmult       rd multiplier to use in making param choices
- * \param[in]      skip_cdef_feature Speed feature to skip cdef
- * \param[in]      cdef_control  Parameter that controls CDEF application
- * \param[in]      is_screen_content   Whether it is screen content type
- * \param[in]      non_reference_frame Indicates if current frame is
- * non-reference
- * \param[in]      rtc_ext_rc   Indicate if external RC is used for testing
+ * \param[in,out]  cpi                 Top level encoder structure
  *
  * \remark Nothing is returned. Instead, optimal CDEF parameters are stored
  * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
@@ -248,13 +239,7 @@
  * \arg \c damping_factor: CDEF damping factor.
  *
  */
-void av1_cdef_search(struct MultiThreadInfo *mt_info,
-                     const YV12_BUFFER_CONFIG *frame,
-                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
-                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
-                     int skip_cdef_feature, CDEF_CONTROL cdef_control,
-                     const int is_screen_content, int non_reference_frame,
-                     int rtc_ext_rc);
+void av1_cdef_search(struct AV1_COMP *cpi);
 
 /*!\brief AV1 CDEF level from QP
  *
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 7212469..6429064 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -52,6 +52,11 @@
   { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
 };
 
+#if DEBUG_LR_COSTING
+RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+                                 [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif  // DEBUG_LR_COSTING
+
 typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
                                       const YV12_BUFFER_CONFIG *b);
 typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
@@ -100,31 +105,14 @@
 }
 
 typedef struct {
-  // The best coefficients for Wiener or Sgrproj restoration
-  WienerInfo wiener;
-  SgrprojInfo sgrproj;
-
-  // The sum of squared errors for this rtype.
-  int64_t sse[RESTORE_SWITCHABLE_TYPES];
-
-  // The rtype to use for this unit given a frame rtype as
-  // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
-  RestorationType best_rtype[RESTORE_TYPES - 1];
-
-  // This flag will be set based on the speed feature
-  // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
-  uint8_t skip_sgr_eval;
-} RestUnitSearchInfo;
-
-typedef struct {
   const YV12_BUFFER_CONFIG *src;
   YV12_BUFFER_CONFIG *dst;
 
   const AV1_COMMON *cm;
   const MACROBLOCK *x;
   int plane;
-  int plane_width;
-  int plane_height;
+  int plane_w;
+  int plane_h;
   RestUnitSearchInfo *rusi;
 
   // Speed features
@@ -135,16 +123,32 @@
   const uint8_t *src_buffer;
   int src_stride;
 
-  // sse and bits are initialised by reset_rsc in search_rest_type
-  int64_t sse;
-  int64_t bits;
-  int tile_y0, tile_stripe0;
+  // SSE values for each restoration mode for the current RU
+  // These are saved by each search function for use in search_switchable()
+  int64_t sse[RESTORE_SWITCHABLE_TYPES];
 
-  // sgrproj and wiener are initialised by rsc_on_tile when starting the first
-  // tile in the frame.
-  SgrprojInfo sgrproj;
-  WienerInfo wiener;
-  PixelRect tile_rect;
+  // This flag will be set based on the speed feature
+  // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+  uint8_t skip_sgr_eval;
+
+  // Total SSE and bits so far for each restoration type.
+  // These are initialised by reset_rsc in search_rest_type.
+  int64_t total_sse[RESTORE_TYPES];
+  int64_t total_bits[RESTORE_TYPES];
+
+  // Reference parameters for delta-coding
+  //
+  // For each restoration type, we need to store the latest parameter set which
+  // has been used, so that we can properly cost up the next parameter set.
+  // Note that we have two sets of these - one for the single-restoration-mode
+  // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ)
+  // and one for the switchable mode. This is because these two cases can lead
+  // to different sets of parameters being signaled, but we don't know which
+  // we will pick for sure until the end of the search process.
+  WienerInfo ref_wiener;
+  SgrprojInfo ref_sgrproj;
+  WienerInfo switchable_ref_wiener;
+  SgrprojInfo switchable_ref_sgrproj;
 
   // Buffers used to hold dgd-avg and src-avg data respectively during SIMD
   // call of Wiener filter.
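The ref_wiener / ref_sgrproj and switchable_ref_* members added above hold the most recently signaled parameters so that each unit can be costed as a delta against them. A standalone toy sketch of that idea follows; the parameter struct and the bit-count model are invented purely for illustration, the point being that the reference only advances when a unit actually selects the filter.

/* Toy sketch of delta-coded parameter costing against a running reference,
 * mirroring ref_wiener / switchable_ref_wiener above.  The cost model here is
 * invented (absolute coefficient deltas). */
#include <stdio.h>
#include <stdlib.h>

typedef struct { int coeff[3]; } toy_params;

static int toy_delta_bits(const toy_params *cur, const toy_params *ref) {
  int bits = 0;
  for (int i = 0; i < 3; i++) bits += abs(cur->coeff[i] - ref->coeff[i]);
  return bits;
}

int main(void) {
  toy_params ref = { { 0, 0, 0 } };          /* set_default_*() equivalent */
  const toy_params unit[2] = { { { 4, 1, 0 } }, { { 5, 1, 0 } } };
  for (int u = 0; u < 2; u++) {
    const int bits = toy_delta_bits(&unit[u], &ref);
    const int use_filter = (bits < 8);       /* stand-in RD decision */
    printf("unit %d: delta bits %d, selected %d\n", u, bits, use_filter);
    if (use_filter) ref = unit[u];           /* advance ref only if signaled */
  }
  return 0;
}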
@@ -154,14 +158,15 @@
 
 static AOM_INLINE void rsc_on_tile(void *priv) {
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
-  set_default_sgrproj(&rsc->sgrproj);
-  set_default_wiener(&rsc->wiener);
-  rsc->tile_stripe0 = 0;
+  set_default_wiener(&rsc->ref_wiener);
+  set_default_sgrproj(&rsc->ref_sgrproj);
+  set_default_wiener(&rsc->switchable_ref_wiener);
+  set_default_sgrproj(&rsc->switchable_ref_sgrproj);
 }
 
 static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
-  rsc->sse = 0;
-  rsc->bits = 0;
+  memset(rsc->total_sse, 0, sizeof(rsc->total_sse));
+  memset(rsc->total_bits, 0, sizeof(rsc->total_bits));
 }
 
 static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
@@ -179,20 +184,23 @@
 
   const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
   const int is_uv = plane != AOM_PLANE_Y;
-  rsc->plane_width = src->crop_widths[is_uv];
-  rsc->plane_height = src->crop_heights[is_uv];
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+  assert(plane_w == src->crop_widths[is_uv]);
+  assert(plane_h == src->crop_heights[is_uv]);
+  assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+  assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+
+  rsc->plane_w = plane_w;
+  rsc->plane_h = plane_h;
   rsc->src_buffer = src->buffers[plane];
   rsc->src_stride = src->strides[is_uv];
   rsc->dgd_buffer = dgd->buffers[plane];
   rsc->dgd_stride = dgd->strides[is_uv];
-  rsc->tile_rect = av1_whole_frame_rect(cm, is_uv);
-  assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
-  assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
 }
 
 static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
                                     const RestorationTileLimits *limits,
-                                    const PixelRect *tile_rect,
                                     const RestorationUnitInfo *rui) {
   const AV1_COMMON *const cm = rsc->cm;
   const int plane = rsc->plane;
@@ -208,11 +216,11 @@
   const int optimized_lr = 0;
 
   av1_loop_restoration_filter_unit(
-      limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
+      limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h,
       is_uv && cm->seq_params->subsampling_x,
       is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
       fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
-      rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
+      rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error);
 
   return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
 }
@@ -746,7 +754,8 @@
                                  int width, int height, int dat_stride,
                                  int use_highbd, int bit_depth, int pu_width,
                                  int pu_height, int32_t *flt0, int32_t *flt1,
-                                 int flt_stride) {
+                                 int flt_stride,
+                                 struct aom_internal_error_info *error_info) {
   for (int i = 0; i < height; i += pu_height) {
     const int h = AOMMIN(pu_height, height - i);
     int32_t *flt0_row = flt0 + i * flt_stride;
@@ -756,11 +765,13 @@
     // Iterate over the stripe in blocks of width pu_width
     for (int j = 0; j < width; j += pu_width) {
       const int w = AOMMIN(pu_width, width - j);
-      const int ret = av1_selfguided_restoration(
-          dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
-          flt_stride, sgr_params_idx, bit_depth, use_highbd);
-      (void)ret;
-      assert(!ret);
+      if (av1_selfguided_restoration(
+              dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+              flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) {
+        aom_internal_error(
+            error_info, AOM_CODEC_MEM_ERROR,
+            "Error allocating buffer in av1_selfguided_restoration");
+      }
     }
   }
 }
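The hunk above swaps a debug-only assert for aom_internal_error(), so allocation failures inside the self-guided filter are reported through the encoder's error_info rather than ignored in release builds. Below is a standalone sketch of that reporting pattern under simplified assumptions; the struct and helper names are stand-ins, not libaom's aom_internal_error_info machinery, which additionally records a formatted detail string.

/* Standalone sketch: instead of assert()ing, the callee records an error and
 * unwinds to a setjmp() point owned by the caller. */
#include <setjmp.h>
#include <stdio.h>

struct error_info {
  int error_code;
  jmp_buf jmp;
};

static void report_error(struct error_info *info, int code, const char *msg) {
  info->error_code = code;
  fprintf(stderr, "%s\n", msg);
  longjmp(info->jmp, 1); /* unwind to the caller's setjmp() */
}

static void filter_stripe(struct error_info *info, int alloc_failed) {
  if (alloc_failed)
    report_error(info, 1, "Error allocating buffer in selfguided restoration");
  /* ... normal filtering would continue here ... */
}

int main(void) {
  static struct error_info info;
  if (setjmp(info.jmp)) {
    /* Control returns here when filter_stripe() hits an error. */
    return info.error_code;
  }
  filter_stripe(&info, /*alloc_failed=*/1);
  return 0;
}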
@@ -770,10 +781,11 @@
     const int dat_stride, const uint8_t *src8, const int src_stride,
     const int use_highbitdepth, const int bit_depth, const int pu_width,
     const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
-    const int flt_stride, int *exqd, int64_t *err) {
+    const int flt_stride, int *exqd, int64_t *err,
+    struct aom_internal_error_info *error_info) {
   int exq[2];
   apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
-            pu_width, pu_height, flt0, flt1, flt_stride);
+            pu_width, pu_height, flt0, flt1, flt_stride, error_info);
   const sgr_params_type *const params = &av1_sgr_params[ep];
   get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
                     use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
@@ -798,7 +810,8 @@
 static SgrprojInfo search_selfguided_restoration(
     const uint8_t *dat8, int width, int height, int dat_stride,
     const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
-    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) {
+    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+    struct aom_internal_error_info *error_info) {
   int32_t *flt0 = rstbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   int ep, idx, bestep = 0;
@@ -814,7 +827,7 @@
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
                           use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
   } else {
@@ -824,7 +837,7 @@
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
                           use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
     // evaluate left and right ep of winner in seed ep
@@ -835,7 +848,7 @@
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
                           use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
     // evaluate last two group
@@ -844,7 +857,7 @@
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
                           use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
   }
@@ -873,10 +886,10 @@
   return bits;
 }
 
-static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
-                                      const PixelRect *tile, int rest_unit_idx,
-                                      void *priv, int32_t *tmpbuf,
-                                      RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_sgrproj(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
   (void)rlbs;
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
@@ -888,11 +901,11 @@
 
   const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
   // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
-  if (rusi->skip_sgr_eval) {
-    rsc->bits += bits_none;
-    rsc->sse += rusi->sse[RESTORE_NONE];
+  if (rsc->skip_sgr_eval) {
+    rsc->total_bits[RESTORE_SGRPROJ] += bits_none;
+    rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
-    rusi->sse[RESTORE_SGRPROJ] = INT64_MAX;
+    rsc->sse[RESTORE_SGRPROJ] = INT64_MAX;
     return;
   }
 
@@ -911,21 +924,22 @@
       dgd_start, limits->h_end - limits->h_start,
       limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
       rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
-      tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning);
+      tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info);
 
   RestorationUnitInfo rui;
   rui.restoration_type = RESTORE_SGRPROJ;
   rui.sgrproj_info = rusi->sgrproj;
 
-  rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
+  rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui);
 
-  const int64_t bits_sgr = x->mode_costs.sgrproj_restore_cost[1] +
-                           (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
-                            << AV1_PROB_COST_SHIFT);
+  const int64_t bits_sgr =
+      x->mode_costs.sgrproj_restore_cost[1] +
+      (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj)
+       << AV1_PROB_COST_SHIFT);
   double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], bit_depth);
+      x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth);
   double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ], bit_depth);
+      x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth);
   if (rusi->sgrproj.ep < 10)
     cost_sgr *=
         (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
@@ -934,9 +948,16 @@
       (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
   rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
 
-  rsc->sse += rusi->sse[rtype];
-  rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none;
-  if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
+#if DEBUG_LR_COSTING
+  // Store ref params for later checking
+  lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info =
+      rsc->ref_sgrproj;
+#endif  // DEBUG_LR_COSTING
+
+  rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype];
+  rsc->total_bits[RESTORE_SGRPROJ] +=
+      (cost_sgr < cost_none) ? bits_sgr : bits_none;
+  if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj;
 }
 
 static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src,
@@ -1455,13 +1476,11 @@
   return bits;
 }
 
-static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
-                                        const RestorationTileLimits *limits,
-                                        const PixelRect *tile,
-                                        RestorationUnitInfo *rui,
-                                        int wiener_win) {
+static int64_t finer_search_wiener(const RestSearchCtxt *rsc,
+                                   const RestorationTileLimits *limits,
+                                   RestorationUnitInfo *rui, int wiener_win) {
   const int plane_off = (WIENER_WIN - wiener_win) >> 1;
-  int64_t err = try_restoration_unit(rsc, limits, tile, rui);
+  int64_t err = try_restoration_unit(rsc, limits, rui);
 
   if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
 
@@ -1484,7 +1503,7 @@
           plane_wiener->hfilter[p] -= s;
           plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
           plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
-          err2 = try_restoration_unit(rsc, limits, tile, rui);
+          err2 = try_restoration_unit(rsc, limits, rui);
           if (err2 > err) {
             plane_wiener->hfilter[p] += s;
             plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
@@ -1504,7 +1523,7 @@
           plane_wiener->hfilter[p] += s;
           plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
           plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
-          err2 = try_restoration_unit(rsc, limits, tile, rui);
+          err2 = try_restoration_unit(rsc, limits, rui);
           if (err2 > err) {
             plane_wiener->hfilter[p] -= s;
             plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
@@ -1525,7 +1544,7 @@
           plane_wiener->vfilter[p] -= s;
           plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
           plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
-          err2 = try_restoration_unit(rsc, limits, tile, rui);
+          err2 = try_restoration_unit(rsc, limits, rui);
           if (err2 > err) {
             plane_wiener->vfilter[p] += s;
             plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
@@ -1545,7 +1564,7 @@
           plane_wiener->vfilter[p] += s;
           plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
           plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
-          err2 = try_restoration_unit(rsc, limits, tile, rui);
+          err2 = try_restoration_unit(rsc, limits, rui);
           if (err2 > err) {
             plane_wiener->vfilter[p] -= s;
             plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
@@ -1564,13 +1583,13 @@
   return err;
 }
 
-static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
-                                     const PixelRect *tile_rect,
-                                     int rest_unit_idx, void *priv,
-                                     int32_t *tmpbuf,
-                                     RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_wiener(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
   (void)tmpbuf;
   (void)rlbs;
+  (void)error_info;
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
@@ -1592,13 +1611,13 @@
         var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
     // Do not perform Wiener search if source variance is lower than threshold
     // or if the reconstruction error is zero
-    int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0);
+    int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0);
     if (prune_wiener) {
-      rsc->bits += bits_none;
-      rsc->sse += rusi->sse[RESTORE_NONE];
+      rsc->total_bits[RESTORE_WIENER] += bits_none;
+      rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
       rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
-      rusi->sse[RESTORE_WIENER] = INT64_MAX;
-      if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+      rsc->sse[RESTORE_WIENER] = INT64_MAX;
+      if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
       return;
     }
   }
@@ -1654,16 +1673,16 @@
   // reduction in the function, the filter is reverted back to identity
   if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
                     rui.wiener_info.hfilter) > 0) {
-    rsc->bits += bits_none;
-    rsc->sse += rusi->sse[RESTORE_NONE];
+    rsc->total_bits[RESTORE_WIENER] += bits_none;
+    rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
-    rusi->sse[RESTORE_WIENER] = INT64_MAX;
-    if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+    rsc->sse[RESTORE_WIENER] = INT64_MAX;
+    if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
     return;
   }
 
-  rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener(
-      rsc, limits, tile_rect, &rui, reduced_wiener_win);
+  rsc->sse[RESTORE_WIENER] =
+      finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
   rusi->wiener = rui.wiener_info;
 
   if (reduced_wiener_win != WIENER_WIN) {
@@ -1675,14 +1694,14 @@
 
   const int64_t bits_wiener =
       x->mode_costs.wiener_restore_cost[1] +
-      (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
+      (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
        << AV1_PROB_COST_SHIFT);
 
   double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE],
+      x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
       rsc->cm->seq_params->bit_depth);
   double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER],
+      x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
       rsc->cm->seq_params->bit_depth);
 
   RestorationType rtype =
@@ -1692,44 +1711,49 @@
   // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
   // RESTORE_NONE or based on best_rtype
   if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
-    rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+    rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
   } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
-    rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+    rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
   }
 
-  rsc->sse += rusi->sse[rtype];
-  rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
-  if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;
+#if DEBUG_LR_COSTING
+  // Store ref params for later checking
+  lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+      rsc->ref_wiener;
+#endif  // DEBUG_LR_COSTING
+
+  rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+  rsc->total_bits[RESTORE_WIENER] +=
+      (cost_wiener < cost_none) ? bits_wiener : bits_none;
+  if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
 }
 
-static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
-                                        const PixelRect *tile_rect,
-                                        int rest_unit_idx, void *priv,
-                                        int32_t *tmpbuf,
-                                        RestorationLineBuffers *rlbs) {
-  (void)tile_rect;
+static AOM_INLINE void search_norestore(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
+  (void)rest_unit_idx;
   (void)tmpbuf;
   (void)rlbs;
+  (void)error_info;
 
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
-  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
   const int highbd = rsc->cm->seq_params->use_highbitdepth;
-  rusi->sse[RESTORE_NONE] = sse_restoration_unit(
+  rsc->sse[RESTORE_NONE] = sse_restoration_unit(
       limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
 
-  rsc->sse += rusi->sse[RESTORE_NONE];
+  rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE];
 }
 
-static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
-                                         const PixelRect *tile_rect,
-                                         int rest_unit_idx, void *priv,
-                                         int32_t *tmpbuf,
-                                         RestorationLineBuffers *rlbs) {
+static AOM_INLINE void search_switchable(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
   (void)limits;
-  (void)tile_rect;
   (void)tmpbuf;
   (void)rlbs;
+  (void)error_info;
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
@@ -1743,24 +1767,32 @@
   RestorationType best_rtype = RESTORE_NONE;
 
   for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
-    // Check for the condition that wiener or sgrproj search could not
-    // find a solution or the solution was worse than RESTORE_NONE.
-    // In either case the best_rtype will be set as RESTORE_NONE. These
-    // should be skipped from the test below.
+    // If this restoration mode was skipped, or could not find a solution
+    // that was better than RESTORE_NONE, then we can't select it here either.
+    //
+    // Note: It is possible for the restoration search functions to find a
+    // filter which is better than RESTORE_NONE when looking purely at SSE, but
+    // for it to be rejected overall due to its rate cost. In this case, there
+    // is a chance that it may have a lower rate cost when looking at
+    // RESTORE_SWITCHABLE, and so it might be acceptable here.
+    //
+    // Therefore we prune based on SSE, rather than on whether or not the
+    // previous search function selected this mode.
     if (r > RESTORE_NONE) {
-      if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue;
+      if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue;
     }
 
-    const int64_t sse = rusi->sse[r];
+    const int64_t sse = rsc->sse[r];
     int64_t coeff_pcost = 0;
     switch (r) {
       case RESTORE_NONE: coeff_pcost = 0; break;
       case RESTORE_WIENER:
-        coeff_pcost =
-            count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener);
+        coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener,
+                                        &rsc->switchable_ref_wiener);
         break;
       case RESTORE_SGRPROJ:
-        coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj);
+        coeff_pcost =
+            count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj);
         break;
       default: assert(0); break;
     }
@@ -1779,10 +1811,19 @@
 
   rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
 
-  rsc->sse += rusi->sse[best_rtype];
-  rsc->bits += best_bits;
-  if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener;
-  if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj;
+#if DEBUG_LR_COSTING
+  // Store ref params for later checking
+  lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info =
+      rsc->switchable_ref_wiener;
+  lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info =
+      rsc->switchable_ref_sgrproj;
+#endif  // DEBUG_LR_COSTING
+
+  rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype];
+  rsc->total_bits[RESTORE_SWITCHABLE] += best_bits;
+  if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener;
+  if (best_rtype == RESTORE_SGRPROJ)
+    rsc->switchable_ref_sgrproj = rusi->sgrproj;
 }
 
 static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
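To make the SSE-based pruning argument in the comment above concrete, here is a standalone toy calculation; the rd() model and all numbers are invented. With a distant delta-coding reference the Wiener filter loses to RESTORE_NONE in the single-mode search, yet the same filter wins in the switchable search where the reference happens to be closer, so pruning on the earlier best_rtype would have discarded a useful option while pruning on SSE keeps it.

/* Worked toy example of why pruning on SSE is safer than pruning on the
 * earlier best_rtype decision.  A different delta-coding reference can flip
 * the rate term between the two searches. */
#include <stdio.h>

static double rd(double lambda, int bits, long long sse) {
  return lambda * bits + (double)sse;
}

int main(void) {
  const double lambda = 8.0;
  const long long sse_none = 10000, sse_wiener = 9200;
  /* Single-mode (RESTORE_WIENER) search: reference params far away,
   * so the filter costs 150 bits and loses to RESTORE_NONE. */
  printf("wiener-only: none=%.0f wiener=%.0f\n",
         rd(lambda, 2, sse_none), rd(lambda, 150, sse_wiener));
  /* Switchable search: a closer reference makes the same filter cheap,
   * so it now wins.  Its SSE (9200) is below sse_none, so SSE-based
   * pruning keeps it available. */
  printf("switchable:  none=%.0f wiener=%.0f\n",
         rd(lambda, 2, sse_none), rd(lambda, 40, sse_wiener));
  return 0;
}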
@@ -1796,23 +1837,96 @@
     rui->sgrproj_info = rusi->sgrproj;
 }
 
-static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
+static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc,
+                               bool *disable_lr_filter) {
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  const CommonTileParams *tiles = &cm->tiles;
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
+  RestorationInfo *rsi = &cm->rst_info[plane];
+  const int ru_size = rsi->restoration_unit_size;
+  const int ext_size = ru_size * 3 / 2;
+
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
   static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
     search_norestore, search_wiener, search_sgrproj, search_switchable
   };
 
+  const int plane_num_units = rsi->num_rest_units;
+  const RestorationType num_rtypes =
+      (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
   reset_rsc(rsc);
-  rsc_on_tile(rsc);
 
-  av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
-                                 &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
-  return RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth);
-}
+  // Iterate over restoration units in encoding order, so that each RU gets
+  // the correct reference parameters when we cost it up. This is effectively
+  // a nested iteration over:
+  // * Each tile, order does not matter
+  //   * Each superblock within that tile, in raster order
+  //     * Each LR unit which is coded within that superblock, in raster order
+  for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+    int sb_row_start = tiles->row_start_sb[tile_row];
+    int sb_row_end = tiles->row_start_sb[tile_row + 1];
+    for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+      int sb_col_start = tiles->col_start_sb[tile_col];
+      int sb_col_end = tiles->col_start_sb[tile_col + 1];
 
-static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
-  const RestorationInfo *rsi = &cm->rst_info[plane];
-  return rsi->units_per_tile;
+      // Reset reference parameters for delta-coding at the start of each tile
+      rsc_on_tile(rsc);
+
+      for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+        int mi_row = sb_row << mib_size_log2;
+        for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+          int mi_col = sb_col << mib_size_log2;
+
+          int rcol0, rcol1, rrow0, rrow1;
+          int has_lr_info = av1_loop_restoration_corners_in_sb(
+              cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+              &rrow1);
+
+          if (!has_lr_info) continue;
+
+          RestorationTileLimits limits;
+          for (int rrow = rrow0; rrow < rrow1; rrow++) {
+            int y0 = rrow * ru_size;
+            int remaining_h = plane_h - y0;
+            int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+            limits.v_start = y0;
+            limits.v_end = y0 + h;
+            assert(limits.v_end <= plane_h);
+            // Offset upwards to align with the restoration processing stripe
+            const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+            limits.v_start = AOMMAX(0, limits.v_start - voffset);
+            if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+            for (int rcol = rcol0; rcol < rcol1; rcol++) {
+              int x0 = rcol * ru_size;
+              int remaining_w = plane_w - x0;
+              int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+              limits.h_start = x0;
+              limits.h_end = x0 + w;
+              assert(limits.h_end <= plane_w);
+
+              const int unit_idx = rrow * rsi->horz_units + rcol;
+
+              rsc->skip_sgr_eval = 0;
+              for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+                if (disable_lr_filter[r]) continue;
+
+                funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+                        cm->error);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
 static INLINE void av1_derive_flags_for_lr_processing(
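The limits arithmetic above (the last unit absorbs the tail when the remainder is under 1.5 units, then the stripe offset shifts the interval upward) can be checked with a small standalone example. The rounding rule for the unit count and the RESTORATION_UNIT_OFFSET value of 8 are assumptions here, chosen to match the usual libaom defaults.

/* Standalone arithmetic sketch of the vertical RU limits computed above,
 * for a 720-row luma plane with 256-pixel units (ext_size = 384). */
#include <stdio.h>

int main(void) {
  const int plane_h = 720, ru_size = 256, ext_size = ru_size * 3 / 2;
  const int voffset = 8; /* RESTORATION_UNIT_OFFSET >> ss_y for luma */
  /* Unit count rounding rule assumed: last unit is between 0.5x and 1.5x. */
  const int vert_units = (plane_h + (ru_size >> 1)) / ru_size;
  for (int rrow = 0; rrow < vert_units; rrow++) {
    const int y0 = rrow * ru_size;
    const int remaining_h = plane_h - y0;
    /* The last unit absorbs the remainder when it is under 1.5 units tall. */
    const int h = (remaining_h < ext_size) ? remaining_h : ru_size;
    int v_start = y0, v_end = y0 + h;
    v_start = (v_start - voffset < 0) ? 0 : v_start - voffset;
    if (v_end < plane_h) v_end -= voffset;
    printf("rrow %d: v_start=%d v_end=%d\n", rrow, v_start, v_end);
  }
  /* Prints 0..248, 248..504, 504..720: the offset stripes tile the plane. */
  return 0;
}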
@@ -1833,31 +1947,101 @@
       (is_wiener_disabled || is_sgr_disabled);
 }
 
-void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
-  const SequenceHeader *const seq_params = cm->seq_params;
-  const int num_planes = av1_num_planes(cm);
-  assert(!cm->features.all_lossless);
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Allocate both decoder-side and encoder-side info structs for a single plane.
+// The unit size passed in should be the minimum size which we are going to
+// search; before each search, set_restoration_unit_size() must be called to
+// configure the actual size.
+static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+                                                   RestorationInfo *rsi,
+                                                   int is_uv,
+                                                   int min_luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+  int s = 0;
+#endif  // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int min_unit_size = min_luma_unit_size >> s;
 
-  av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 
-  int ntiles[2];
-  for (int is_uv = 0; is_uv < 2; ++is_uv)
-    ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
+  const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w);
+  const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h);
+  const int max_num_units = max_horz_units * max_vert_units;
 
-  assert(ntiles[1] <= ntiles[0]);
+  aom_free(rsi->unit_info);
+  CHECK_MEM_ERROR(cm, rsi->unit_info,
+                  (RestorationUnitInfo *)aom_memalign(
+                      16, sizeof(*rsi->unit_info) * max_num_units));
+
   RestUnitSearchInfo *rusi;
   CHECK_MEM_ERROR(
       cm, rusi,
-      (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]));
+      (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units));
 
   // If the restoration unit dimensions are not multiples of
   // rsi->restoration_unit_size then some elements of the rusi array may be
   // left uninitialised when we reach copy_unit_info(...). This is not a
   // problem, as these elements are ignored later, but in order to quiet
   // Valgrind's warnings we initialise the array below.
-  memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+  memset(rusi, 0, sizeof(*rusi) * max_num_units);
+
+  return rusi;
+}
+
+static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi,
+                                      int is_uv, int luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+  int s = 0;
+#endif  // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int unit_size = luma_unit_size >> s;
+
+  int plane_w, plane_h;
+  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+  const int horz_units = av1_lr_count_units(unit_size, plane_w);
+  const int vert_units = av1_lr_count_units(unit_size, plane_h);
+
+  rsi->restoration_unit_size = unit_size;
+  rsi->num_rest_units = horz_units * vert_units;
+  rsi->horz_units = horz_units;
+  rsi->vert_units = vert_units;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+  const int num_planes = av1_num_planes(cm);
+  const int highbd = cm->seq_params->use_highbitdepth;
+  assert(!cm->features.all_lossless);
+
+  av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
+  // Select unit size based on speed feature settings, and allocate
+  // rui structs based on this size
+  int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+  int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size;
+
+  // The minimum unit size allowed at the syntax level is 1 superblock.
+  // Apply this constraint here so that the speed-feature code which sets
+  // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size.
+  min_lr_unit_size =
+      AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs(
+        cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size);
+  }
+
   x->rdmult = cpi->rd.RDMULT;
 
   // Allocate the frame buffer trial_frame_rst, which is used to temporarily
@@ -1865,33 +2049,32 @@
   if (aom_realloc_frame_buffer(
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
           cm->superres_upscaled_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL,
-          NULL, 0, 0))
+          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+          cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
     aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
 
   RestSearchCtxt rsc;
 
   // The buffers 'src_avg' and 'dgd_avg' are used to compute H and M buffers.
-  // These buffers are required for AVX2 SIMD purpose only. Hence, allocated the
-  // same if AVX2 variant of SIMD for av1_compute_stats() is enabled. The buffer
-  // size required is calculated based on maximum width and height of the LRU
-  // (i.e., from foreach_rest_unit_in_tile() 1.5 times the
-  // RESTORATION_UNITSIZE_MAX) allowed for Wiener filtering. The width and
-  // height aligned to multiple of 16 is considered for intrinsic purpose.
+  // These buffers are only required for the AVX2 and NEON implementations of
+  // av1_compute_stats. The required buffer size is derived from the maximum
+  // LRU width and height allowed for Wiener filtering, i.e. 1.5 times
+  // RESTORATION_UNITSIZE_MAX (see foreach_rest_unit_in_plane()). The width and
+  // height are aligned to a multiple of 16 for the intrinsics.
   rsc.dgd_avg = NULL;
   rsc.src_avg = NULL;
-#if HAVE_AVX2
+#if HAVE_AVX2 || HAVE_NEON
   // The buffers allocated below are used during Wiener filter processing of low
   // bitdepth path. Hence, allocate the same when Wiener filter is enabled in
   // low bitdepth path.
-  if (!cpi->sf.lpf_sf.disable_wiener_filter &&
-      !cm->seq_params->use_highbitdepth) {
-    const int buf_size = sizeof(*rsc.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX *
-                         RESTORATION_UNITSIZE_MAX;
-    CHECK_MEM_ERROR(cm, rsc.dgd_avg, (int16_t *)aom_memalign(32, buf_size));
+  if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+    const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+                         RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+    CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+                    (int16_t *)aom_memalign(32, buf_size));
 
+    rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
     // When LRU width isn't multiple of 16, the 256 bits load instruction used
     // in AVX2 intrinsic can read data beyond valid LRU. Hence, in order to
     // silence Valgrind warning this buffer is initialized with zero. Overhead
@@ -1904,59 +2087,131 @@
   }
 #endif
 
-  const int plane_start = AOM_PLANE_Y;
-  const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+  // Initialize all planes, so that any planes we skip searching will still have
+  // valid data
+  for (int plane = 0; plane < num_planes; plane++) {
+    cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+  }
+
+  // Decide which planes to search
+  int plane_start, plane_end;
+
+  if (lpf_sf->disable_loop_restoration_luma) {
+    plane_start = AOM_PLANE_U;
+  } else {
+    plane_start = AOM_PLANE_Y;
+  }
+
+  if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+    plane_end = AOM_PLANE_Y;
+  } else {
+    plane_end = AOM_PLANE_V;
+  }
 
   // Derive the flags to enable/disable Loop restoration filters based on the
   // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
   bool disable_lr_filter[RESTORE_TYPES] = { false };
-  const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
   av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
 
-  for (int plane = plane_start; plane <= plane_end; ++plane) {
-    init_rsc(src, &cpi->common, x, lpf_sf, plane, rusi, &cpi->trial_frame_rst,
-             &rsc);
+  for (int plane = plane_start; plane <= plane_end; plane++) {
+    const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+    const int is_uv = plane != AOM_PLANE_Y;
+    int plane_w, plane_h;
+    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+    av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv],
+                     RESTORATION_BORDER, RESTORATION_BORDER, highbd);
+  }
 
-    const int plane_ntiles = ntiles[plane > 0];
-    const RestorationType num_rtypes =
-        (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+  double best_cost = DBL_MAX;
+  int best_luma_unit_size = max_lr_unit_size;
+  for (int luma_unit_size = max_lr_unit_size;
+       luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) {
+    int64_t bits_this_size = 0;
+    int64_t sse_this_size = 0;
+    RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE,
+                                                 RESTORE_NONE };
+    for (int plane = plane_start; plane <= plane_end; ++plane) {
+      set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+                                luma_unit_size);
+      init_rsc(src, &cpi->common, x, lpf_sf, plane,
+               cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc);
 
-    double best_cost = 0;
-    RestorationType best_rtype = RESTORE_NONE;
+      restoration_search(cm, plane, &rsc, disable_lr_filter);
 
-    const int highbd = rsc.cm->seq_params->use_highbitdepth;
-    if ((plane && !lpf_sf->disable_loop_restoration_chroma) ||
-        (!plane && !lpf_sf->disable_loop_restoration_luma)) {
-      av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
-                       rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
-                       highbd);
-
+      const int plane_num_units = cm->rst_info[plane].num_rest_units;
+      const RestorationType num_rtypes =
+          (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+      double best_cost_this_plane = DBL_MAX;
       for (RestorationType r = 0; r < num_rtypes; ++r) {
         // Disable Loop restoration filter based on the flags set using speed
         // feature 'disable_wiener_filter' and 'disable_sgr_filter'.
         if (disable_lr_filter[r]) continue;
 
-        double cost = search_rest_type(&rsc, r);
+        double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+            x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r],
+            cm->seq_params->bit_depth);
 
-        if (r == 0 || cost < best_cost) {
-          best_cost = cost;
-          best_rtype = r;
+        if (cost_this_plane < best_cost_this_plane) {
+          best_cost_this_plane = cost_this_plane;
+          best_rtype[plane] = r;
         }
       }
+
+      bits_this_size += rsc.total_bits[best_rtype[plane]];
+      sse_this_size += rsc.total_sse[best_rtype[plane]];
     }
 
-    cm->rst_info[plane].frame_restoration_type = best_rtype;
-    if (best_rtype != RESTORE_NONE) {
-      for (int u = 0; u < plane_ntiles; ++u) {
-        copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
+    double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        x->rdmult, bits_this_size >> 4, sse_this_size,
+        cm->seq_params->bit_depth);
+
+    if (cost_this_size < best_cost) {
+      best_cost = cost_this_size;
+      best_luma_unit_size = luma_unit_size;
+      // Copy parameters out of rusi struct, before we overwrite it at
+      // the start of the next iteration
+      bool all_none = true;
+      for (int plane = plane_start; plane <= plane_end; ++plane) {
+        cm->rst_info[plane].frame_restoration_type = best_rtype[plane];
+        if (best_rtype[plane] != RESTORE_NONE) {
+          all_none = false;
+          const int plane_num_units = cm->rst_info[plane].num_rest_units;
+          for (int u = 0; u < plane_num_units; ++u) {
+            copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u],
+                           &cm->rst_info[plane].unit_info[u]);
+          }
+        }
       }
+      // Heuristic: if every best_rtype entry is RESTORE_NONE, we could not
+      // find any useful filter at this size, so we are unlikely to find one
+      // at a smaller size either; stop searching.
+      if (all_none) {
+        break;
+      }
+    } else {
+      // Heuristic: if this size is worse than the previous (larger) size,
+      // the next size down is likely to be even worse; stop searching.
+      break;
     }
   }
-#if HAVE_AVX2
-  if (!cpi->sf.lpf_sf.disable_wiener_filter &&
-      !cm->seq_params->use_highbitdepth) {
-    aom_free(rsc.dgd_avg);
+
+  // Final fixup to set the correct unit size
+  // We set this for all planes, even ones we have skipped searching,
+  // so that other code does not need to care which planes were and weren't
+  // searched
+  for (int plane = 0; plane < num_planes; ++plane) {
+    set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+                              best_luma_unit_size);
+  }
+
+#if HAVE_AVX2 || HAVE_NEON
+  if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+    aom_free(cpi->pick_lr_ctxt.dgd_avg);
+    cpi->pick_lr_ctxt.dgd_avg = NULL;
   }
 #endif
-  aom_free(rusi);
+  for (int plane = 0; plane < num_planes; plane++) {
+    aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+    cpi->pick_lr_ctxt.rusi[plane] = NULL;
+  }
 }
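av1_pick_filter_restoration() now sweeps the luma unit size from max_lr_unit_size down to min_lr_unit_size, halving at each step, keeps the cheapest size, and applies the two early-exit heuristics noted in the comments. A standalone toy version of that outer loop, with an invented cost curve standing in for the per-plane search:

/* Toy version of the unit-size sweep added above.  evaluate_size() is a
 * stand-in for running the restoration search at a given size. */
#include <float.h>
#include <stdio.h>

static double evaluate_size(int unit_size) {
  /* Invented cost curve: pretend 128 is the sweet spot. */
  return (unit_size == 128) ? 100.0 : 100.0 + (unit_size > 128 ? 5.0 : 3.0);
}

int main(void) {
  const int max_size = 256, min_size = 64;
  double best_cost = DBL_MAX;
  int best_size = max_size;
  for (int size = max_size; size >= min_size; size >>= 1) {
    const double cost = evaluate_size(size);
    printf("size %d -> cost %.1f\n", size, cost);
    if (cost < best_cost) {
      best_cost = cost;
      best_size = size;
    } else {
      break; /* a smaller size is unlikely to recover, as in the heuristic */
    }
  }
  printf("selected unit size: %d\n", best_size); /* prints 128 */
  return 0;
}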
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index 94a6932..d1d0b0c 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -20,6 +20,34 @@
 struct yv12_buffer_config;
 struct AV1_COMP;
 
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+//   so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+//   we set the frame restoration mode to RESTORE_WIENER, and the search where
+//   we set it to RESTORE_SWITCHABLE.
+//   So we need to store at least two sets of Wiener params and two sets of
+//   SGR params, and the easiest way to do this is to index by
+//   frame_restoration_type
+extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+                                        [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif  // DEBUG_LR_COSTING
+
 static const uint8_t g_shuffle_stats_data[16] = {
   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 };
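The declaration above only stores the recorded references; how they are consumed is up to the bitstream writer. The following standalone sketch shows the intended record-then-verify flow using toy types; it illustrates the indexing by restoration type, plane and unit index, and is not code from the actual writer.

/* Hypothetical DEBUG_LR_COSTING cross-check with toy stand-in types: the
 * search records the reference it delta-coded against, and the writer asserts
 * it is delta-coding against the same reference. */
#include <assert.h>
#include <string.h>

enum { TOY_RESTORE_TYPES = 4, TOY_PLANES = 3, TOY_UNITS = 64 * 64 };
typedef struct { short vfilter[7], hfilter[7]; } toy_wiener;

static toy_wiener toy_ref_params[TOY_RESTORE_TYPES][TOY_PLANES][TOY_UNITS];

/* Called from the search: remember the reference used to cost this unit. */
static void record_ref(int rtype, int plane, int unit, const toy_wiener *ref) {
  toy_ref_params[rtype][plane][unit] = *ref;
}

/* Called from the writer: verify it delta-codes against the same reference. */
static void check_ref(int rtype, int plane, int unit, const toy_wiener *ref) {
  assert(memcmp(&toy_ref_params[rtype][plane][unit], ref, sizeof(*ref)) == 0);
}

int main(void) {
  const toy_wiener ref = { { 1, -2, 3, 8, 3, -2, 1 },
                           { 0, 1, -4, 10, -4, 1, 0 } };
  record_ref(/*rtype=*/1, /*plane=*/0, /*unit=*/0, &ref);
  check_ref(1, 0, 0, &ref);
  return 0;
}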
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index fdf1495..de0f596 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -334,7 +334,7 @@
                                        double framerate) {
   // Assume we do not need any constraint lower than 4K 20 fps
   static const double factor_safe = 3840 * 2160 * 20.0;
-  const double factor = width * height * framerate;
+  const double factor = (double)width * height * framerate;
   const int default_interval =
       clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
 
@@ -453,12 +453,19 @@
 #else
   int64_t buffer_level = p_rc->buffer_level;
 #endif
-
-  if (!oxcf->rc_cfg.drop_frames_water_mark) {
+  // Never drop on key frame, or for frame whose base layer is key.
+  // If drop_count_consec hits or exceeds max_consec_drop then don't drop.
+  if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+      (cpi->ppi->use_svc &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+      !oxcf->rc_cfg.drop_frames_water_mark ||
+      (rc->max_consec_drop > 0 &&
+       rc->drop_count_consec >= rc->max_consec_drop)) {
     return 0;
   } else {
     if (buffer_level < 0) {
       // Always drop if buffer is below 0.
+      rc->drop_count_consec++;
       return 1;
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
@@ -473,6 +480,7 @@
       if (rc->decimation_factor > 0) {
         if (rc->decimation_count > 0) {
           --rc->decimation_count;
+          rc->drop_count_consec++;
           return 1;
         } else {
           rc->decimation_count = rc->decimation_factor;
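The new drop_count_consec / max_consec_drop fields cap how many frames in a row the rate controller may drop. A standalone toy model of that interaction follows; the buffer test is reduced to a flag, and the reset mirrors the postencode update seen later in this patch.

/* Toy sketch of the consecutive-drop cap: each dropped frame increments a
 * counter, an encoded frame resets it, and once the counter reaches
 * max_consec_drop the controller is forced to encode. */
#include <stdio.h>

typedef struct {
  int drop_count_consec;
  int max_consec_drop;
} toy_rc;

static int maybe_drop(toy_rc *rc, int buffer_underrun) {
  if (rc->max_consec_drop > 0 && rc->drop_count_consec >= rc->max_consec_drop)
    return 0;                       /* cap reached: force this frame to encode */
  if (buffer_underrun) {
    rc->drop_count_consec++;
    return 1;
  }
  return 0;
}

int main(void) {
  toy_rc rc = { 0, 2 };
  for (int frame = 0; frame < 5; frame++) {
    const int dropped = maybe_drop(&rc, /*buffer_underrun=*/1);
    if (!dropped) rc.drop_count_consec = 0;  /* postencode update resets it */
    printf("frame %d: %s\n", frame, dropped ? "dropped" : "encoded");
  }
  /* Prints: dropped, dropped, encoded, dropped, dropped. */
  return 0;
}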
@@ -493,8 +501,16 @@
   const AV1_COMMON *const cm = &cpi->common;
   const SVC *const svc = &cpi->svc;
   const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  // Flag to indicate previous frame has overshoot, and buffer level
+  // for current frame is low (less than ~half of optimal). For such
+  // (inter) frames, if the source_sad is non-zero, relax the max_delta_up
+  // and clamp applied below.
+  const bool overshoot_buffer_low =
+      cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+      p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+      rc->frames_since_key > 4;
   int max_delta_down;
-  int max_delta_up = 20;
+  int max_delta_up = overshoot_buffer_low ? 60 : 20;
   const int change_avg_frame_bandwidth =
       abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
       0.1 * (rc->avg_frame_bandwidth);
@@ -543,7 +559,7 @@
     // not been set due to dropped frames.
     if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
         rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
-        rc->q_2_frame > 0) {
+        rc->q_2_frame > 0 && !overshoot_buffer_low) {
       int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
                          AOMMAX(rc->q_1_frame, rc->q_2_frame));
       // If the previous frame had overshoot and the current q needs to
@@ -2343,6 +2359,7 @@
   rc->prev_coded_height = cm->height;
   rc->frame_number_encoded++;
   rc->prev_frame_is_dropped = 0;
+  rc->drop_count_consec = 0;
   // if (current_frame->frame_number == 1 && cm->show_frame)
   /*
   rc->this_frame_target =
@@ -2957,10 +2974,8 @@
   }
   if (width != cm->render_width || height != cm->render_height ||
       unscaled_src == NULL || unscaled_last_src == NULL) {
-    if (cpi->src_sad_blk_64x64) {
-      aom_free(cpi->src_sad_blk_64x64);
-      cpi->src_sad_blk_64x64 = NULL;
-    }
+    aom_free(cpi->src_sad_blk_64x64);
+    cpi->src_sad_blk_64x64 = NULL;
   }
   if (unscaled_src == NULL || unscaled_last_src == NULL) return;
   src_y = unscaled_src->y_buffer;
@@ -2972,10 +2987,8 @@
   last_src_width = unscaled_last_src->y_width;
   last_src_height = unscaled_last_src->y_height;
   if (src_width != last_src_width || src_height != last_src_height) {
-    if (cpi->src_sad_blk_64x64) {
-      aom_free(cpi->src_sad_blk_64x64);
-      cpi->src_sad_blk_64x64 = NULL;
-    }
+    aom_free(cpi->src_sad_blk_64x64);
+    cpi->src_sad_blk_64x64 = NULL;
     return;
   }
   rc->high_source_sad = 0;
@@ -2990,13 +3003,18 @@
   }
   int num_zero_temp_sad = 0;
   uint32_t min_thresh = 10000;
-  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+    min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+                     ? 50000
+                     : 100000;
+  }
   const BLOCK_SIZE bsize = BLOCK_64X64;
   // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
   uint64_t avg_sad = 0;
   uint64_t tmp_sad = 0;
   int num_samples = 0;
-  const int thresh = 6;
+  const int thresh =
+      cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
   // SAD is computed on 64x64 blocks
   const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
                                 ? (cm->seq_params->mib_size >> 1)
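A small standalone restatement of the threshold selection added above, using the same constants; it only shows which clips fall into the relaxed low-resolution, low-framerate branch.

/* Toy illustration: small (<= 320x240), slow (< 10 fps) non-screen clips get
 * a lower SAD threshold so scene cuts are still detected from the sparser,
 * noisier per-block samples. */
#include <stdio.h>

int main(void) {
  const struct { int w, h; double fps; } clips[2] = { { 320, 240, 7.5 },
                                                      { 1280, 720, 30.0 } };
  for (int i = 0; i < 2; i++) {
    const int small_slow =
        clips[i].w * clips[i].h <= 320 * 240 && clips[i].fps < 10.0;
    const unsigned min_thresh = small_slow ? 50000 : 100000;
    const int thresh = small_slow ? 5 : 6;
    printf("%dx%d @ %.1f fps: min_thresh=%u thresh=%d\n", clips[i].w,
           clips[i].h, clips[i].fps, min_thresh, thresh);
  }
  return 0;
}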
@@ -3127,6 +3145,10 @@
   int qindex;
   double tot_scale_change = (double)(resize_width * resize_height) /
                             (double)(prev_width * prev_height);
+  // Disable the skip mv search for svc on resize frame.
+  svc->skip_mvsearch_last = 0;
+  svc->skip_mvsearch_gf = 0;
+  svc->skip_mvsearch_altref = 0;
   // Reset buffer level to optimal, update target size.
   p_rc->buffer_level = p_rc->optimal_buffer_level;
   p_rc->bits_off_target = p_rc->optimal_buffer_level;
@@ -3386,7 +3408,7 @@
     if (rc->prev_coded_width == cm->width &&
         rc->prev_coded_height == cm->height) {
       rc_scene_detection_onepass_rt(cpi, frame_input);
-    } else if (cpi->src_sad_blk_64x64) {
+    } else {
       aom_free(cpi->src_sad_blk_64x64);
       cpi->src_sad_blk_64x64 = NULL;
     }
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 4fb1179..6802ad4 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -205,6 +205,8 @@
   int decimation_factor;
   int decimation_count;
   int prev_frame_is_dropped;
+  int drop_count_consec;
+  int max_consec_drop;
 
   /*!
    * Frame number for encoded frames (non-dropped).
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 8bc7d1b..c2d76e7 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -809,10 +809,11 @@
 
   // Frame level dv cost update
   if (av1_need_dv_costs(cpi)) {
-    if (cpi->td.mb.dv_costs == NULL) {
+    if (cpi->td.dv_costs_alloc == NULL) {
       CHECK_MEM_ERROR(
-          cm, cpi->td.mb.dv_costs,
-          (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs)));
+          cm, cpi->td.dv_costs_alloc,
+          (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc)));
+      cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc;
     }
     av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
   }
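The change above splits ownership (td.dv_costs_alloc) from use (td.mb.dv_costs), so the buffer is always freed through the pointer that owns it even if mb.dv_costs is later repointed elsewhere. A minimal standalone sketch of that owner/borrower pattern with simplified stand-in structs:

/* Owner/borrower split: the thread data owns the allocation while the
 * macroblock context only borrows a pointer to it. */
#include <stdlib.h>

typedef struct { int joint_cost[4]; } toy_dv_costs;
typedef struct { toy_dv_costs *dv_costs; } toy_mb;                  /* borrower */
typedef struct { toy_dv_costs *dv_costs_alloc; toy_mb mb; } toy_td; /* owner */

int main(void) {
  toy_td td = { NULL, { NULL } };
  if (td.dv_costs_alloc == NULL) {
    td.dv_costs_alloc = (toy_dv_costs *)malloc(sizeof(*td.dv_costs_alloc));
    if (td.dv_costs_alloc == NULL) return 1;
    td.mb.dv_costs = td.dv_costs_alloc; /* borrow for encoding */
  }
  /* ... td.mb.dv_costs may be repointed during tile encoding ... */
  free(td.dv_costs_alloc);              /* always free via the owning pointer */
  return 0;
}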
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8620087..c17fbcc 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1322,7 +1322,8 @@
   const int mi_col = xd->mi_col;
   int mode_index_start, mode_index_end;
   const int txfm_rd_gate_level =
-      get_txfm_rd_gate_level(cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+      get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+                             cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
                              TX_SEARCH_MOTION_MODE, eval_motion_mode);
 
   // Modify the start and end index according to speed features. For example,
@@ -1656,16 +1657,16 @@
     // Call av1_enc_build_inter_predictor() for one plane at a time.
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
                                   plane, plane);
-    const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
 
     av1_subtract_plane(x, plane_bsize, plane);
-    int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
-    sse >>= ((cpi->frame_info.bit_depth - 8) * 2);
+
+    int64_t sse =
+        av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
+    if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+    sse <<= 4;
     total_sse += sse;
     // When current rd cost is more than the best rd, skip evaluation of
     // remaining planes.
@@ -2055,6 +2056,9 @@
                                 HandleInterModeArgs *const args,
                                 int64_t ref_best_rd, BLOCK_SIZE bsize,
                                 const int ref_set) {
+  // If only one ref MV candidate is available, do not prune it; it is
+  // better to evaluate that single candidate than to prune it.
+  if (ref_set == 1) return 1;
   AV1_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -2833,17 +2837,6 @@
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
 
-  // As per the experiments, in real-time preset impact of model rd based
-  // breakouts is less on encoding time if the following conditions are true.
-  //    (1) compound mode is disabled
-  //    (2) interpolation filter search is disabled
-  // TODO(any): Check the impact of model rd based breakouts in other presets
-  const int skip_interp_search_modelrd_calc =
-      cpi->oxcf.mode == REALTIME &&
-      cm->current_frame.reference_mode == SINGLE_REFERENCE &&
-      (cpi->sf.rt_sf.skip_interp_filter_search ||
-       cpi->sf.winner_mode_sf.winner_mode_ifs);
-
   for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
     save_mv[i][0].as_int = INVALID_MV;
     save_mv[i][1].as_int = INVALID_MV;
@@ -2993,7 +2986,7 @@
       if (not_best_mode) continue;
     }
 
-    if (!skip_interp_search_modelrd_calc) {
+    if (!args->skip_ifs) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
       start_timing(cpi, interpolation_filter_search_time);
 #endif
@@ -3125,7 +3118,7 @@
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
   if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
-      cpi->sf.rt_sf.use_nonrd_pick_mode)
+      !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode)
     return INT64_MAX;
   const int num_planes = av1_num_planes(cm);
 
@@ -3188,12 +3181,14 @@
   av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 
   FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+  const SEARCH_METHODS search_method =
+      av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
   const search_site_config *lookahead_search_sites =
       cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
   const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
   av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
                                      &dv_ref.as_mv, start_mv,
-                                     lookahead_search_sites,
+                                     lookahead_search_sites, search_method,
                                      /*fine_search_interval=*/0);
   const IntraBCMVCosts *const dv_costs = x->dv_costs;
   av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
@@ -3242,9 +3237,11 @@
     const int step_param = cpi->mv_search_params.mv_step_param;
     IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
     int_mv best_mv, best_hash_mv;
+    FULLPEL_MV_STATS best_mv_stats;
 
-    int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param,
-                                        NULL, &best_mv.as_fullmv, NULL);
+    int bestsme =
+        av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+                              &best_mv.as_fullmv, &best_mv_stats, NULL);
     const int hashsme = av1_intrabc_hash_search(
         cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
     if (hashsme < bestsme) {
@@ -3780,6 +3777,7 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   unsigned char segment_id = mbmi->segment_id;
   const SPEED_FEATURES *const sf = &cpi->sf;
+  const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
   REF_SET ref_set = REF_SET_FULL;
 
   if (sf->rt_sf.use_real_time_ref_set)
@@ -3852,7 +3850,7 @@
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
-    if (sf->inter_sf.alt_ref_search_fp &&
+    if (inter_sf->alt_ref_search_fp &&
         (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
       mask->pred_modes[ALTREF_FRAME] = 0;
       disable_inter_references_except_altref(mask->ref_combo);
@@ -3860,19 +3858,19 @@
     }
   }
 
-  if (sf->inter_sf.alt_ref_search_fp) {
+  if (inter_sf->alt_ref_search_fp) {
     if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
       int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
       // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
       // those are past frames
       MV_REFERENCE_FRAME start_frame =
-          sf->inter_sf.alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+          inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
       for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
         if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
             0) {
           // Prune inter modes when relative dist of ALTREF2 and ALTREF is close
           // to the relative dist of LAST_FRAME.
-          if (sf->inter_sf.alt_ref_search_fp == 1 &&
+          if (inter_sf->alt_ref_search_fp == 1 &&
               (abs(cpi->ref_frame_dist_info
                        .ref_relative_dist[ref_frame - LAST_FRAME]) >
                1.5 * abs(cpi->ref_frame_dist_info
@@ -3913,6 +3911,33 @@
 
   mask->pred_modes[INTRA_FRAME] |=
       ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+
+  // Prune reference frames that are not the closest to the current frame and
+  // have a large pred_mv_sad.
+  if (inter_sf->prune_single_ref) {
+    assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3);
+    const double prune_threshes[2] = { 1.20, 1.05 };
+
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const RefFrameDistanceInfo *const ref_frame_dist_info =
+          &cpi->ref_frame_dist_info;
+      const int is_closest_ref =
+          (ref_frame == ref_frame_dist_info->nearest_past_ref) ||
+          (ref_frame == ref_frame_dist_info->nearest_future_ref);
+
+      if (!is_closest_ref) {
+        const int dir =
+            (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+                ? 0
+                : 1;
+        if (x->best_pred_mv_sad[dir] < INT_MAX &&
+            x->pred_mv_sad[ref_frame] >
+                prune_threshes[inter_sf->prune_single_ref - 1] *
+                    x->best_pred_mv_sad[dir])
+          mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL;
+      }
+    }
+  }
 }
 
 static AOM_INLINE void init_neighbor_pred_buf(
@@ -4025,6 +4050,7 @@
       setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
     }
     if (cpi->sf.inter_sf.alt_ref_search_fp ||
+        cpi->sf.inter_sf.prune_single_ref ||
         cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
       // Store the best pred_mv_sad across all past frames
       if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
@@ -4995,8 +5021,16 @@
   *(args->skip_motion_mode) = (ret == 2);
 
   // We've reached the first compound prediction mode, get stats from the
-  // single reference predictors to help with pruning
-  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+  // single reference predictors to help with pruning.
+  // Disable this pruning logic if interpolation filter search was skipped for
+  // single prediction modes as it can result in aggressive pruning of compound
+  // prediction modes due to the absence of modelled_rd populated by
+  // av1_interpolation_filter_search().
+  // TODO(Remya): Check the impact of the sf
+  // 'prune_comp_search_by_single_result' if compound prediction modes are
+  // enabled in future for REALTIME encode.
+  if (!sf->interp_sf.skip_interp_filter_search &&
+      sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
       args->reach_first_comp_mode == 0) {
     analyze_single_states(cpi, args->search_state);
     args->reach_first_comp_mode = 1;
@@ -5014,7 +5048,8 @@
 
   // Skip this compound mode based on the RD results from the single prediction
   // modes
-  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+  if (!sf->interp_sf.skip_interp_filter_search &&
+      sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
     if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
                                        ref_frame, second_ref_frame, x))
       return 1;
@@ -5203,6 +5238,7 @@
     const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
     int64_t skip_rd = INT64_MAX;
     const int txfm_rd_gate_level = get_txfm_rd_gate_level(
+        cm->seq_params->enable_masked_compound,
         cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT,
         /*eval_motion_mode=*/0);
     if (txfm_rd_gate_level) {
@@ -5665,6 +5701,20 @@
   }
 }
 
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+                                                 int is_single_pred) {
+  const MODE encoding_mode = cpi->oxcf.mode;
+  if (encoding_mode == REALTIME) {
+    return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+            (cpi->sf.interp_sf.skip_interp_filter_search ||
+             cpi->sf.winner_mode_sf.winner_mode_ifs));
+  } else if (encoding_mode == GOOD) {
+    // Skip interpolation filter search for single prediction modes.
+    return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred);
+  }
+  return false;
+}
+
 static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
                                          const MACROBLOCK *x,
                                          BLOCK_SIZE bsize) {
@@ -5727,6 +5777,7 @@
                                INT_MAX,
                                search_state.simple_rd,
                                0,
+                               false,
                                interintra_modes,
                                { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
                                { { 0, 0 } },
@@ -5960,6 +6011,7 @@
     args.single_comp_cost = real_compmode_cost;
     args.ref_frame_cost = ref_frame_cost;
     args.best_pred_sse = search_state.best_pred_sse;
+    args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred);
 
     int64_t skip_rd[2] = { search_state.best_skip_rd[0],
                            search_state.best_skip_rd[1] };
@@ -5976,7 +6028,8 @@
     end_timing(cpi, handle_inter_mode_time);
 #endif
     if (current_frame->reference_mode != SINGLE_REFERENCE) {
-      if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+      if (!args.skip_ifs &&
+          sf->inter_sf.prune_comp_search_by_single_result > 0 &&
           is_inter_singleref_mode(this_mode)) {
         collect_single_states(x, &search_state, mbmi);
       }
@@ -6007,6 +6060,10 @@
       update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                           &rd_stats_uv, mode_enum, x, do_tx_search);
       if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+      // skip_rd[0] is the best total rd for a skip mode so far.
+      // skip_rd[1] is the best total rd for a skip mode so far in luma.
+      // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated.
+      // When do_tx_search = 0, skip_rd[1] is updated.
       search_state.best_skip_rd[1] = skip_rd[1];
     }
     if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 1c5b3db..b6bc492 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -774,12 +774,18 @@
 
 // Get transform rd gate level for the given transform search case.
 static INLINE int get_txfm_rd_gate_level(
+    const int is_masked_compound_enabled,
     const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
     TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
   assert(tx_search_case < TX_SEARCH_CASES);
   if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
       num_pels_log2_lookup[bsize] > 8)
     return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE];
+  // Enable aggressive gating of transform search only when masked compound type
+  // is enabled.
+  else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+           is_masked_compound_enabled)
+    return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE];
 
   return txfm_rd_gate_level[TX_SEARCH_DEFAULT];
 }
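For reference, a minimal standalone sketch (not part of this diff) showing how the extended get_txfm_rd_gate_level() above behaves for the compound-type case; the level values and the wrapper function are hypothetical and assume the usual encoder headers are on the include path:

#include "av1/encoder/rdopt_utils.h"

// Sketch: query the gate level for compound-type transform search under both
// masked-compound settings.
static void example_txfm_gate_levels(void) {
  // Hypothetical per-case levels: { default, motion mode, comp type }.
  const int levels[TX_SEARCH_CASES] = { 2, 5, 3 };
  // Masked compound enabled: the dedicated comp-type level (3) is used.
  (void)get_txfm_rd_gate_level(/*is_masked_compound_enabled=*/1, levels,
                               BLOCK_32X32, TX_SEARCH_COMP_TYPE_MODE,
                               /*eval_motion_mode=*/0);
  // Masked compound disabled: falls back to the default level (2).
  (void)get_txfm_rd_gate_level(/*is_masked_compound_enabled=*/0, levels,
                               BLOCK_32X32, TX_SEARCH_COMP_TYPE_MODE,
                               /*eval_motion_mode=*/0);
}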
diff --git a/av1/encoder/saliency_map.c b/av1/encoder/saliency_map.c
index 3376846..30019bb 100644
--- a/av1/encoder/saliency_map.c
+++ b/av1/encoder/saliency_map.c
@@ -1261,7 +1261,7 @@
     return 0;
   }
 
-  const int bsize = cm->seq_params->sb_size;
+  const BLOCK_SIZE bsize = cm->seq_params->sb_size;
   const int num_mi_w = mi_size_wide[bsize];
   const int num_mi_h = mi_size_high[bsize];
   const int block_width = block_size_wide[bsize];
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c432e42..830d2c6 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -847,6 +847,14 @@
     } else {
       sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
     }
+
+    if (is_720p_or_larger)
+      sf->part_sf.ext_part_eval_based_on_cur_best =
+          (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1;
+
+    if (is_480p_or_larger) {
+      sf->tpl_sf.reduce_num_frames = 1;
+    }
   }
 
   if (speed >= 6) {
@@ -863,6 +871,10 @@
       sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
     }
 
+    if (is_480p_or_larger) {
+      sf->hl_sf.allow_sub_blk_me_in_tf = 1;
+    }
+
     if (is_1080p_or_larger) {
       sf->part_sf.default_min_partition_size = BLOCK_8X8;
     }
@@ -972,6 +984,9 @@
   sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
 
   if (speed >= 1) {
+    sf->hl_sf.adjust_num_frames_for_arf_filtering =
+        allow_screen_content_tools ? 0 : 1;
+
     sf->part_sf.intra_cnn_based_part_prune_level =
         allow_screen_content_tools ? 0 : 2;
     sf->part_sf.simple_motion_search_early_term_none = 1;
@@ -1100,6 +1115,7 @@
     sf->mv_sf.search_method = DIAMOND;
     sf->mv_sf.disable_second_mv = 2;
     sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
+    sf->mv_sf.use_intrabc = 0;
 
     sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
     sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
@@ -1173,6 +1189,7 @@
     sf->inter_sf.alt_ref_search_fp = 2;
     sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3;
     sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5;
+    sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
     sf->inter_sf.prune_ext_comp_using_neighbors = 2;
@@ -1197,6 +1214,7 @@
 
     sf->tpl_sf.subpel_force_stop = HALF_PEL;
     sf->tpl_sf.search_method = FAST_BIGDIA;
+    sf->tpl_sf.use_sad_for_mode_decision = 1;
 
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
 
@@ -1213,6 +1231,8 @@
 
   if (speed >= 5) {
     sf->hl_sf.weight_calc_level_in_tf = 1;
+    sf->hl_sf.adjust_num_frames_for_arf_filtering =
+        allow_screen_content_tools ? 0 : 2;
 
     sf->fp_sf.reduce_mv_step_param = 4;
 
@@ -1223,15 +1243,18 @@
     sf->part_sf.ext_partition_eval_thresh =
         allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
     sf->part_sf.prune_sub_8x8_partition_level =
-        (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
-                                                                          : 2;
+        allow_screen_content_tools ? 1 : 2;
 
     sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND;
 
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
+    sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1;
     sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4;
+    sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5;
     sf->inter_sf.enable_fast_compound_mode_search = 2;
 
+    sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1;
+
     sf->intra_sf.chroma_intra_pruning_with_hog = 3;
 
     // TODO(any): Extend multi-winner mode processing support for inter frames
@@ -1247,6 +1270,7 @@
     sf->tpl_sf.use_y_only_rate_distortion = 1;
     sf->tpl_sf.subpel_force_stop = FULL_PEL;
     sf->tpl_sf.gop_length_decision_method = 2;
+    sf->tpl_sf.use_sad_for_mode_decision = 2;
 
     sf->winner_mode_sf.dc_blk_pred_level = 2;
 
@@ -1256,11 +1280,10 @@
   if (speed >= 6) {
     sf->hl_sf.disable_extra_sc_testing = 1;
     sf->hl_sf.second_alt_ref_filtering = 0;
-    sf->hl_sf.adjust_num_frames_for_arf_filtering =
-        allow_screen_content_tools ? 0 : 1;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
     sf->inter_sf.selective_ref_frame = 6;
+    sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2;
     sf->inter_sf.prune_ext_comp_using_neighbors = 3;
 
     sf->intra_sf.chroma_intra_pruning_with_hog = 4;
@@ -1273,10 +1296,7 @@
 
     sf->part_sf.prune_rectangular_split_based_on_qidx =
         boosted || allow_screen_content_tools ? 0 : 2;
-    sf->part_sf.prune_sub_8x8_partition_level =
-        allow_screen_content_tools          ? 0
-        : frame_is_intra_only(&cpi->common) ? 1
-                                            : 2;
+
     sf->part_sf.prune_part4_search = 3;
 
     sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
@@ -1488,11 +1508,7 @@
         cpi->svc.number_temporal_layers > 1)
       sf->hl_sf.accurate_bit_estimate = 0;
 
-    // TODO(yunqingwang@google.com): test to see if
-    // estimate_motion_for_var_based_partition == 2 helps here.
-    if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
-      sf->rt_sf.estimate_motion_for_var_based_partition = 1;
-    if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 1;
 
     // For single layers RPS: bias/adjustment for recovery frame.
     if (cpi->ppi->rtc_ref.bias_recovery_frame) {
@@ -1509,6 +1525,8 @@
       sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
       sf->mv_sf.use_bsize_dependent_search_method = 0;
       sf->rt_sf.skip_cdef_sb = 1;
+      sf->rt_sf.increase_color_thresh_palette = 1;
+      if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1;
     }
     if (speed >= 8) {
       sf->rt_sf.nonrd_check_partition_merge_mode = 3;
@@ -1542,6 +1560,7 @@
       sf->rt_sf.part_early_exit_zeromv = 2;
       sf->rt_sf.prune_palette_nonrd = 1;
       sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+      sf->rt_sf.increase_color_thresh_palette = 0;
     }
     sf->rt_sf.use_nonrd_altref_frame = 0;
     sf->rt_sf.use_rtc_tf = 0;
@@ -1565,12 +1584,11 @@
     }
     sf->rt_sf.partition_direct_merging = 0;
     sf->hl_sf.accurate_bit_estimate = 0;
-
-    // "sf->rt_sf.estimate_motion_for_var_based_partition = 2" doesn't work well
-    // for screen contents.
-    if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
+    // This feature is for nonrd_pickmode and non-svc for now.
+    if (sf->rt_sf.use_nonrd_pick_mode && !cpi->ppi->use_svc)
       sf->rt_sf.estimate_motion_for_var_based_partition = 1;
-    if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    else
+      sf->rt_sf.estimate_motion_for_var_based_partition = 0;
   }
   if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
     sf->rt_sf.use_rtc_tf = 0;
@@ -1608,6 +1626,7 @@
   sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
   sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
   sf->interp_sf.cb_pred_filter_search = 0;
+  sf->interp_sf.skip_interp_filter_search = 1;
   sf->part_sf.ml_prune_partition = 1;
   sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
   sf->part_sf.prune_ext_partition_types_search_level = 2;
@@ -1619,7 +1638,6 @@
   // Disable Wiener and Self-guided Loop restoration filters.
   sf->lpf_sf.disable_wiener_filter = true;
   sf->lpf_sf.disable_sgr_filter = true;
-  sf->rt_sf.skip_interp_filter_search = 1;
   sf->intra_sf.prune_palette_search_level = 2;
   sf->intra_sf.prune_luma_palette_size_search_level = 2;
   sf->intra_sf.early_term_chroma_palette_size_search = 1;
@@ -1785,6 +1803,8 @@
     // This sf is not applicable in non-rd path.
     sf->inter_sf.skip_newmv_in_drl = 0;
 
+    sf->interp_sf.skip_interp_filter_search = 0;
+
     // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
     // good. May need more study.
     for (int i = 0; i < TX_SIZES; ++i) {
@@ -1810,7 +1830,6 @@
     sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
 #endif
     sf->rt_sf.short_circuit_low_temp_var = 0;
-    sf->rt_sf.skip_interp_filter_search = 0;
     // For spatial layers, only LAST and GOLDEN are currently used in the SVC
     // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
     // get_ref_frame_flags() for some patterns, so disable it here for
@@ -1872,6 +1891,10 @@
     sf->rt_sf.var_part_split_threshold_shift = 10;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
   }
+  if (speed >= 11 && !frame_is_intra_only(cm) &&
+      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    sf->winner_mode_sf.dc_blk_pred_level = 3;
+  }
 }
 
 static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
@@ -1887,6 +1910,7 @@
   hl_sf->adjust_num_frames_for_arf_filtering = 0;
   hl_sf->accurate_bit_estimate = 0;
   hl_sf->weight_calc_level_in_tf = 0;
+  hl_sf->allow_sub_blk_me_in_tf = 0;
 }
 
 static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
@@ -1907,6 +1931,8 @@
   tpl_sf->prune_ref_frames_in_tpl = 0;
   tpl_sf->allow_compound_pred = 1;
   tpl_sf->use_y_only_rate_distortion = 0;
+  tpl_sf->use_sad_for_mode_decision = 0;
+  tpl_sf->reduce_num_frames = 0;
 }
 
 static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
@@ -1948,6 +1974,7 @@
   part_sf->intra_cnn_based_part_prune_level = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
   part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->ext_part_eval_based_on_cur_best = 0;
   part_sf->prune_ext_part_using_split_info = 0;
   part_sf->prune_rectangular_split_based_on_qidx = 0;
   part_sf->prune_rect_part_using_4x4_var_deviation = false;
@@ -1983,6 +2010,7 @@
   mv_sf->skip_fullpel_search_using_startmv = 0;
   mv_sf->warp_search_method = WARP_SEARCH_SQUARE;
   mv_sf->warp_search_iters = 8;
+  mv_sf->use_intrabc = 1;
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
@@ -1990,6 +2018,7 @@
   inter_sf->model_based_post_interp_filter_breakout = 0;
   inter_sf->reduce_inter_modes = 0;
   inter_sf->alt_ref_search_fp = 0;
+  inter_sf->prune_single_ref = 0;
   inter_sf->prune_comp_ref_frames = 0;
   inter_sf->selective_ref_frame = 0;
   inter_sf->prune_ref_frame_for_rect_partitions = 0;
@@ -2042,6 +2071,7 @@
   interp_sf->skip_sharp_interp_filter_search = 0;
   interp_sf->use_fast_interpolation_filter_search = 0;
   interp_sf->use_interp_filter = 0;
+  interp_sf->skip_interp_filter_search = 0;
 }
 
 static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
@@ -2139,6 +2169,8 @@
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
   lpf_sf->disable_loop_restoration_chroma = 0;
   lpf_sf->disable_loop_restoration_luma = 0;
+  lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+  lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
   lpf_sf->prune_wiener_based_on_src_var = 0;
   lpf_sf->prune_sgr_based_on_wiener = 0;
   lpf_sf->enable_sgr_ep_pruning = 0;
@@ -2172,7 +2204,6 @@
   rt_sf->num_inter_modes_for_tx_search = INT_MAX;
   rt_sf->use_nonrd_filter_search = 0;
   rt_sf->use_simple_rd_model = 0;
-  rt_sf->skip_interp_filter_search = 0;
   rt_sf->hybrid_intra_pickmode = 0;
   rt_sf->source_metrics_sb_nonrd = 0;
   rt_sf->overshoot_detection_cbr = NO_DETECTION;
@@ -2200,6 +2231,7 @@
   rt_sf->use_rtc_tf = 0;
   rt_sf->prune_idtx_nonrd = 0;
   rt_sf->prune_palette_nonrd = 0;
+  rt_sf->dct_only_palette_nonrd = 0;
   rt_sf->part_early_exit_zeromv = 0;
   rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
   rt_sf->skip_lf_screen = 0;
@@ -2222,6 +2254,7 @@
   rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
   rt_sf->enable_ref_short_signaling = false;
   rt_sf->check_globalmv_on_single_ref = true;
+  rt_sf->increase_color_thresh_palette = false;
 }
 
 static fractional_mv_step_fp
@@ -2454,9 +2487,11 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const int boosted = frame_is_boosted(cpi);
+  const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+  const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440;
   const int is_arf2_bwd_type =
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
 
@@ -2508,23 +2543,36 @@
 
   if (speed >= 2) {
     // Disable extended partitions for lower quantizers
-    const int aggr = AOMMIN(3, speed - 2);
+    const int aggr = AOMMIN(4, speed - 2);
     const int qindex_thresh1[4] = { 50, 50, 80, 100 };
     const int qindex_thresh2[4] = { 80, 100, 120, 160 };
     int qindex_thresh;
-    int disable_ext_part;
     if (aggr <= 1) {
       const int qthresh2 =
           (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
       qindex_thresh = cm->features.allow_screen_content_tools
                           ? qindex_thresh1[aggr]
                           : qthresh2;
-      disable_ext_part = !boosted;
-    } else {
+      if (cm->quant_params.base_qindex <= qindex_thresh && !boosted)
+        sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+    } else if (aggr <= 2) {
       qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
-      disable_ext_part = !frame_is_intra_only(cm);
-    }
-    if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) {
+      if (cm->quant_params.base_qindex <= qindex_thresh &&
+          !frame_is_intra_only(cm))
+        sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+    } else if (aggr <= 3) {
+      if (!is_480p_or_larger) {
+        sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+      } else if (!is_720p_or_larger && !frame_is_intra_only(cm) &&
+                 !cm->features.allow_screen_content_tools) {
+        sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+      } else {
+        qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+        if (cm->quant_params.base_qindex <= qindex_thresh &&
+            !frame_is_intra_only(cm))
+          sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+      }
+    } else {
       sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
@@ -2593,6 +2641,45 @@
     }
   }
 
+  if (speed >= 5) {
+    // Disable the sf for low quantizers in case of low-resolution screen
+    // content.
+    if (cm->features.allow_screen_content_tools &&
+        cm->quant_params.base_qindex < 128 && is_480p_or_lesser) {
+      sf->part_sf.prune_sub_8x8_partition_level = 0;
+    }
+  }
+
+  // Loop restoration size search
+  // At speed 0, always search all available sizes for the maximum possible gain
+  sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+  sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+
+  if (speed >= 1) {
+    // For large frames, small restoration units are almost never useful,
+    // so prune them away
+    if (is_1440p_or_larger) {
+      sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+    } else if (is_720p_or_larger) {
+      sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+    }
+  }
+
+  if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) {
+    // At this speed, a full search is too expensive. Instead, pick a single
+    // unit size based on frame size and qindex. Note that higher quantizers
+    // (== lower quality) and larger frames generally favor larger restoration
+    // units.
+    int qindex_thresh = 96;
+    if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) {
+      sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+      sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+    } else {
+      sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+      sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+    }
+  }
+
   set_subpel_search_method(&cpi->mv_search_params,
                            cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
                            sf->mv_sf.subpel_search_method);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 27a07c5..14cd874 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -92,6 +92,8 @@
                             (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
                             (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
                             (1 << NEAR_NEARMV),
+  INTER_SINGLE_ALL =
+      (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV),
 };
 
 enum {
@@ -240,11 +242,14 @@
 } UENUM1BYTE(PRUNE_NEARMV_LEVEL);
 
 enum {
-  // Default Transform search case - used in evaluation of compound type mode
-  // and best inter candidates
+  // Default transform search used in evaluation of best inter candidates
+  // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL
+  // stage).
   TX_SEARCH_DEFAULT = 0,
-  // Transform search in motion mode rd
+  // Transform search in motion mode rd during MODE_EVAL stage.
   TX_SEARCH_MOTION_MODE,
+  // Transform search in compound type mode rd during MODE_EVAL stage.
+  TX_SEARCH_COMP_TYPE_MODE,
   // All transform search cases
   TX_SEARCH_CASES
 } UENUM1BYTE(TX_SEARCH_CASE);
@@ -448,7 +453,11 @@
 
   /*!
    * The number of frames to be used during temporal filtering of an ARF frame
-   * is adjusted based on noise level of the current frame.
+   * is adjusted based on the noise level of the current frame. The sf has
+   * three levels that decide the number of frames considered for filtering:
+   * 0       : Use the default number of frames
+   * 1 and 2 : Reduce the number of frames based on the noise level, with
+   * increasing aggressiveness
    */
   int adjust_num_frames_for_arf_filtering;
 
@@ -465,6 +474,14 @@
    * 1: Calculate weight using a lookup table that approximates exp().
    */
   int weight_calc_level_in_tf;
+
+  /*!
+   * Decide whether or not to perform motion estimation at the split block
+   * (i.e. 16x16) level.
+   * 0: Always allow motion estimation.
+   * 1: Conditionally allow motion estimation based on 4x4 sub-block variance.
+   */
+  int allow_sub_blk_me_in_tf;
 } HIGH_LEVEL_SPEED_FEATURES;
 
 /*!
@@ -536,6 +553,19 @@
 
   // Calculate rate and distortion based on Y plane only.
   int use_y_only_rate_distortion;
+
+  // Use SAD instead of SATD during intra/inter mode search.
+  // If set to 0, use SATD always.
+  // If set to 1, use SAD during intra/inter mode search for frames in the
+  // higher temporal layers of the hierarchical prediction structure.
+  // If set to 2, use SAD during intra/inter mode search for all frames.
+  // This sf is disabled for the first GF group of the key-frame interval,
+  // i.e., SATD is used during intra/inter mode search of the first GF group.
+  int use_sad_for_mode_decision;
+
+  // Skip tpl processing for frames of type LF_UPDATE.
+  // This sf is disabled for the first GF group of the key-frame interval.
+  int reduce_num_frames;
 } TPL_SPEED_FEATURES;
 
 typedef struct GLOBAL_MOTION_SPEED_FEATURES {
@@ -567,9 +597,10 @@
   // Used if partition_search_type = FIXED_PARTITION
   BLOCK_SIZE fixed_partition_size;
 
-  // Prune extended partition types search
-  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
-  // aggressiveness of pruning in order.
+  // Prune extended partition types search based on the current best partition
+  // and the combined rdcost of the subblocks estimated from previous
+  // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2
+  // increasing aggressiveness of pruning in order.
   int prune_ext_partition_types_search_level;
 
   // Prune part4 based on block size
@@ -654,13 +685,18 @@
   // 2: Prune none, split and rectangular partitions
   int intra_cnn_based_part_prune_level;
 
-  // Disable extended partition search for lower block sizes.
-  int ext_partition_eval_thresh;
+  // Disable extended partition search if the current bsize is greater than the
+  // threshold. Must be a square block size BLOCK_8X8 or higher.
+  BLOCK_SIZE ext_partition_eval_thresh;
+
+  // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+  int ext_part_eval_based_on_cur_best;
 
   // Disable rectangular partitions for larger block sizes.
   int rect_partition_eval_thresh;
 
-  // prune extended partition search
+  // Prune extended partition search based on whether the split/rect partitions
+  // provided an improvement in the previous search.
   // 0 : no pruning
   // 1 : prune 1:4 partition search using winner info from split partitions
   // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
@@ -822,6 +858,9 @@
   // Accurate full pixel motion search based on TPL stats.
   int full_pixel_search_level;
 
+  // Allow intrabc motion search
+  int use_intrabc;
+
   // Whether to downsample the rows in sad calculation during motion search.
   // This is only active when there are at least 16 rows. When this sf is
   // active, if there is a large discrepancy in the SAD values for the final
@@ -900,6 +939,12 @@
   // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
   int alt_ref_search_fp;
 
+  // Prune reference frames for single prediction modes based on temporal
+  // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is
+  // disabled for 0, and higher values apply a more aggressive pruning
+  // threshold.
+  int prune_single_ref;
+
   // Prune compound reference frames
   // 0 no pruning
   // 1 prune compound references which do not satisfy the two conditions:
@@ -1123,6 +1168,10 @@
 
   // adaptive interp_filter search to allow skip of certain filter types.
   int adaptive_interp_filter_search;
+
+  // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation
+  // filter search.
+  int skip_interp_filter_search;
 } INTERP_FILTER_SPEED_FEATURES;
 
 typedef struct INTRA_MODE_SPEED_FEATURES {
@@ -1441,6 +1490,13 @@
   // Disable loop restoration for luma plane
   int disable_loop_restoration_luma;
 
+  // Range of loop restoration unit sizes to search
+  // The minimum size is clamped against the superblock size in
+  // av1_pick_filter_restoration, so that the code which sets this value does
+  // not need to know the superblock size ahead of time.
+  int min_lr_unit_size;
+  int max_lr_unit_size;
+
   // Prune RESTORE_WIENER evaluation based on source variance
   // 0 : no pruning
   // 1 : conservative pruning
@@ -1541,9 +1597,6 @@
   // Use simplified RD model for interpolation search and Intra
   int use_simple_rd_model;
 
-  // If set forces interpolation filter to EIGHTTAP_REGULAR
-  int skip_interp_filter_search;
-
   // For nonrd mode: use hybrid intra mode search for intra only frames based on
   // block properties.
   // 0 : use nonrd pick intra for all blocks
@@ -1671,6 +1724,9 @@
   // Prune the use of palette mode in nonrd pickmode.
   int prune_palette_nonrd;
 
+  // Force the palette search in nonrd pickmode to use DCT only.
+  int dct_only_palette_nonrd;
+
   // Skip loopfilter, for static content after slide change
   // or key frame, once quality has ramped up.
   // 0: disabled
@@ -1798,6 +1854,11 @@
   // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
   // case.
   bool check_globalmv_on_single_ref;
+
+  // Allows for increasing the color_threshold for palette prediction.
+  // This generally leads to better coding efficiency but with some speed loss.
+  // Only used for screen content and for nonrd_pickmode.
+  bool increase_color_thresh_palette;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */
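For reference, a minimal standalone sketch (not part of this diff) of the pred-MV-SAD test that the new prune_single_ref levels drive in rdopt.c; the helper name and the sample values in the trailing comment are hypothetical:

#include <stdbool.h>

// Returns true if a non-closest single reference should be pruned, given its
// pred MV SAD and the best pred MV SAD among references in the same temporal
// direction. Level 1 uses a 1.20x threshold, level 2 a stricter 1.05x.
static bool prune_single_ref_by_sad(int prune_single_ref, unsigned int ref_sad,
                                    unsigned int best_sad_in_dir) {
  const double prune_threshes[2] = { 1.20, 1.05 };
  if (prune_single_ref == 0) return false;
  return ref_sad > prune_threshes[prune_single_ref - 1] * best_sad_in_dir;
}

// Example (hypothetical values): at level 1, a reference with SAD 1300 against
// a directional best of 1000 exceeds 1.20 * 1000 and is therefore pruned.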
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 85678dc..ae0c276 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -33,6 +33,7 @@
   svc->force_zero_mode_spatial_ref = 1;
   svc->num_encoded_top_layer = 0;
   svc->use_flexible_mode = 0;
+  svc->has_lower_quality_layer = 0;
 
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
     for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
@@ -69,7 +70,7 @@
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
-        if (lc->map) aom_free(lc->map);
+        aom_free(lc->map);
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
       }
@@ -155,7 +156,7 @@
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
-        if (lc->map) aom_free(lc->map);
+        aom_free(lc->map);
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
       }
@@ -215,6 +216,7 @@
   LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
+  const int max_consec_drop = cpi->rc.max_consec_drop;
   // Restore layer rate control.
   cpi->rc = lc->rc;
   cpi->ppi->p_rc = lc->p_rc;
@@ -227,6 +229,8 @@
   // before the layer restore. Keep these defined for the stream (not layer).
   cpi->rc.frames_since_key = old_frame_since_key;
   cpi->rc.frames_to_key = old_frame_to_key;
+  // Reset to the value before the layer restore.
+  cpi->rc.max_consec_drop = max_consec_drop;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
   if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
@@ -245,7 +249,8 @@
   // This is to skip searching mv for that reference if it was last
   // refreshed (i.e., buffer slot holding that reference was refreshed) on the
   // previous spatial layer(s) at the same time (current_superframe).
-  if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref) {
+  if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode) {
     if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
       svc->skip_mvsearch_last = 1;
     }
@@ -357,7 +362,8 @@
     for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-      if (lc->map) aom_free(lc->map);
+      aom_free(lc->map);
+      lc->map = NULL;
     }
   }
 }
@@ -395,6 +401,16 @@
   int width = 0, height = 0;
   lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
                            svc->temporal_layer_id];
+  // Set the lower quality layer flag.
+  svc->has_lower_quality_layer = 0;
+  if (cpi->svc.spatial_layer_id > 0) {
+    const LAYER_CONTEXT *lc_prev =
+        &svc->layer_context[(svc->spatial_layer_id - 1) *
+                                svc->number_temporal_layers +
+                            svc->temporal_layer_id];
+    if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1)
+      svc->has_lower_quality_layer = 1;
+  }
   av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
                            cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
                            lc->scaling_factor_den, &width, &height);
@@ -499,7 +515,8 @@
       // Set all buffer_idx to 0.
       // Set GOLDEN to slot 5 and update slot 5.
       for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
-      if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+      if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+          svc->spatial_layer_id < svc->number_spatial_layers - 1) {
         rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
         rtc_ref->refresh[5] = 1;
       }
@@ -509,7 +526,8 @@
       // Set LAST3 to slot 6 and update slot 6.
       for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
       rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
-      if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+      if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+          svc->spatial_layer_id < svc->number_spatial_layers - 1) {
         rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
         rtc_ref->refresh[6] = 1;
       }
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index 3a6e0fc..bfde33d 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -139,6 +139,14 @@
    * Force zero-mv in mode search for the spatial/inter-layer reference.
    */
   int force_zero_mode_spatial_ref;
+
+  /*!
+   * Flag to indicate that the current spatial layer has a lower-quality layer
+   * (at the same timestamp) that can be used as a reference. A lower-quality
+   * layer refers to the same resolution but encoded at a different/lower
+   * bitrate.
+   */
+  int has_lower_quality_layer;
 } SVC;
 
 struct AV1_COMP;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 91a0c78..d6ae667 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <float.h>
 #include <math.h>
 #include <limits.h>
 
@@ -33,6 +34,7 @@
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/pass2_strategy.h"
@@ -49,6 +51,39 @@
 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
                                          MV *subblock_mvs, int *subblock_mses);
 
+// This function returns the minimum and maximum log variances of the 4x4
+// sub-blocks in the current block.
+static INLINE void get_log_var_4x4sub_blk(
+    AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+    int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+    double *blk_4x4_var_max, int is_hbd) {
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  int var_min = INT_MAX;
+  int var_max = 0;
+
+  // Derive the source buffer.
+  const int src_stride = frame_to_filter->y_stride;
+  const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width;
+  const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset;
+
+  for (int i = 0; i < mb_height; i += MI_SIZE) {
+    for (int j = 0; j < mb_width; j += MI_SIZE) {
+      // Calculate the 4x4 sub-block variance.
+      const int var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j,
+          src_stride, is_hbd);
+
+      // Record the min and max for the enclosing block.
+      var_min = AOMMIN(var_min, var);
+      var_max = AOMMAX(var_max, var);
+    }
+  }
+
+  *blk_4x4_var_min = log1p(var_min / 16.0);
+  *blk_4x4_var_max = log1p(var_max / 16.0);
+}
+
 /*!\endcond */
 /*!\brief Does motion search for blocks in temporal filtering. This is
  *  the first step for temporal filtering. More specifically, given a frame to
@@ -68,19 +103,22 @@
  *       the entire block.
  *
  * \ingroup src_frame_proc
- * \param[in]   cpi             Top level encoder instance structure
- * \param[in]   mb              Pointer to macroblock
- * \param[in]   frame_to_filter Pointer to the frame to be filtered
- * \param[in]   ref_frame       Pointer to the reference frame
- * \param[in]   block_size      Block size used for motion search
- * \param[in]   mb_row          Row index of the block in the frame
- * \param[in]   mb_col          Column index of the block in the frame
- * \param[in]   ref_mv          Reference motion vector, which is commonly
- *                              inherited from the motion search result of
- *                              previous frame.
- * \param[out]  subblock_mvs    Pointer to the motion vectors for 4 sub-blocks
- * \param[out]  subblock_mses   Pointer to the search errors (MSE) for 4
- *                              sub-blocks
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   mb                    Pointer to macroblock
+ * \param[in]   frame_to_filter       Pointer to the frame to be filtered
+ * \param[in]   ref_frame             Pointer to the reference frame
+ * \param[in]   block_size            Block size used for motion search
+ * \param[in]   mb_row                Row index of the block in the frame
+ * \param[in]   mb_col                Column index of the block in the frame
+ * \param[in]   ref_mv                Reference motion vector, which is commonly
+ *                                    inherited from the motion search result of
+ *                                    previous frame.
+ * \param[in]   allow_me_for_sub_blks Flag to indicate whether motion search at
+ *                                    16x16 sub-block level is needed or not.
+ * \param[out]  subblock_mvs          Pointer to the motion vectors for
+ *                                    4 sub-blocks
+ * \param[out]  subblock_mses         Pointer to the search errors (MSE) for
+ *                                    4 sub-blocks
  *
  * \remark Nothing will be returned. Results are saved in subblock_mvs and
  *         subblock_mses
@@ -89,7 +127,8 @@
                              const YV12_BUFFER_CONFIG *frame_to_filter,
                              const YV12_BUFFER_CONFIG *ref_frame,
                              const BLOCK_SIZE block_size, const int mb_row,
-                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                             const int mb_col, MV *ref_mv,
+                             bool allow_me_for_sub_blks, MV *subblock_mvs,
                              int *subblock_mses) {
   // Frame information
   const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
@@ -99,7 +138,10 @@
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
   const int y_stride = frame_to_filter->y_stride;
+  const int src_width = frame_to_filter->y_width;
+  const int ref_width = ref_frame->y_width;
   assert(y_stride == ref_frame->y_stride);
+  assert(src_width == ref_width);
   const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
 
   // Save input state.
@@ -127,8 +169,10 @@
   // Setup.
   mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
   mb->plane[0].src.stride = y_stride;
+  mb->plane[0].src.width = src_width;
   mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
   mbd->plane[0].pre[0].stride = y_stride;
+  mbd->plane[0].pre[0].width = ref_width;
 
   const SEARCH_METHODS search_method = NSTEP;
   const search_site_config *search_site_cfg =
@@ -141,14 +185,15 @@
 
   // Do motion search.
   int_mv best_mv;  // Searched motion vector.
+  FULLPEL_MV_STATS best_mv_stats;
   int block_mse = INT_MAX;
   MV block_mv = kZeroMv;
   const int q = av1_get_q(cpi);
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
                                      &baseline_mv, start_mv, search_site_cfg,
+                                     search_method,
                                      /*fine_search_interval=*/0);
-  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
   full_ms_params.run_mesh_search = 1;
   full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
 
@@ -160,7 +205,7 @@
 
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
-                        NULL);
+                        &best_mv_stats, NULL);
 
   if (force_integer_mv == 1) {  // Only do full search on the entire block.
     const int mv_row = best_mv.as_mv.row;
@@ -181,63 +226,66 @@
     // Since we are merely refining the result from full pixel search, we don't
     // need regularization for subpel search
     ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+    best_mv_stats.err_cost = 0;
 
     MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
     error = cpi->mv_search_params.find_fractional_mv_step(
-        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
-        &distortion, &sse, NULL);
+        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats,
+        &best_mv.as_mv, &distortion, &sse, NULL);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
     block_mv = best_mv.as_mv;
     *ref_mv = best_mv.as_mv;
-    // On 4 sub-blocks.
-    const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
-    const int subblock_height = block_size_high[subblock_size];
-    const int subblock_width = block_size_wide[subblock_size];
-    const int subblock_pels = subblock_height * subblock_width;
-    start_mv = get_fullmv_from_mv(ref_mv);
 
-    int subblock_idx = 0;
-    for (int i = 0; i < mb_height; i += subblock_height) {
-      for (int j = 0; j < mb_width; j += subblock_width) {
-        const int offset = i * y_stride + j;
-        mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
-        mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
-        av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
-                                           subblock_size, &baseline_mv,
-                                           start_mv, search_site_cfg,
-                                           /*fine_search_interval=*/0);
-        av1_set_mv_search_method(&full_ms_params, search_site_cfg,
-                                 search_method);
-        full_ms_params.run_mesh_search = 1;
-        full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+    if (allow_me_for_sub_blks) {
+      // On 4 sub-blocks.
+      const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
+      const int subblock_height = block_size_high[subblock_size];
+      const int subblock_width = block_size_wide[subblock_size];
+      const int subblock_pels = subblock_height * subblock_width;
+      start_mv = get_fullmv_from_mv(ref_mv);
 
-        if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
-          // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
-          full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
-          full_ms_params.mesh_search_mv_diff_threshold = 2;
+      int subblock_idx = 0;
+      for (int i = 0; i < mb_height; i += subblock_height) {
+        for (int j = 0; j < mb_width; j += subblock_width) {
+          const int offset = i * y_stride + j;
+          mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+          mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+          av1_make_default_fullpel_ms_params(
+              &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
+              search_site_cfg, search_method,
+              /*fine_search_interval=*/0);
+          full_ms_params.run_mesh_search = 1;
+          full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+          if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+            // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+            full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+            full_ms_params.mesh_search_mv_diff_threshold = 2;
+          }
+          av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                                cond_cost_list(cpi, cost_list),
+                                &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+          av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+                                            &baseline_mv, cost_list);
+          ms_params.forced_stop = EIGHTH_PEL;
+          ms_params.var_params.subpel_search_type = subpel_search_type;
+          // Since we are merely refining the result from full pixel search, we
+          // don't need regularization for subpel search
+          ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+          best_mv_stats.err_cost = 0;
+
+          subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+          assert(
+              av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+          error = cpi->mv_search_params.find_fractional_mv_step(
+              &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+              &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
+          subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+          subblock_mvs[subblock_idx] = best_mv.as_mv;
+          ++subblock_idx;
         }
-
-        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
-                              cond_cost_list(cpi, cost_list),
-                              &best_mv.as_fullmv, NULL);
-
-        av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
-                                          &baseline_mv, cost_list);
-        ms_params.forced_stop = EIGHTH_PEL;
-        ms_params.var_params.subpel_search_type = subpel_search_type;
-        // Since we are merely refining the result from full pixel search, we
-        // don't need regularization for subpel search
-        ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
-
-        subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
-        assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
-        error = cpi->mv_search_params.find_fractional_mv_step(
-            &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
-            &best_mv.as_mv, &distortion, &sse, NULL);
-        subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
-        subblock_mvs[subblock_idx] = best_mv.as_mv;
-        ++subblock_idx;
       }
     }
   }
@@ -247,9 +295,16 @@
   mbd->plane[0].pre[0] = ori_pre_buf;
 
   // Make partition decision.
-  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
-                               subblock_mses);
-
+  if (allow_me_for_sub_blks) {
+    tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+                                 subblock_mses);
+  } else {
+    // Copy the 32x32 block mv and mse values to the sub-blocks.
+    for (int i = 0; i < 4; ++i) {
+      subblock_mvs[i] = block_mv;
+      subblock_mses[i] = block_mse;
+    }
+  }
   // Do not pass down the reference motion vector if error is too large.
   const int thresh = (min_frame_size >= 720) ? 12 : 3;
   if (block_mse > (thresh << (mbd->bd - 8))) {
@@ -842,6 +897,26 @@
     memset(count, 0, num_pels * sizeof(count[0]));
     MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
                           // Perform temporal filtering frame by frame.
+
+    // Decide whether or not to perform motion search at the 16x16 sub-block
+    // level based on the 4x4 sub-block source variance. Allow motion search
+    // for the split partition only if the difference between the max and min
+    // 4x4 source variances exceeds an empirically derived threshold.
+    bool allow_me_for_sub_blks = true;
+    if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+      const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+      // Initialize minimum variance to a large value and maximum variance to 0.
+      double blk_4x4_var_min = DBL_MAX;
+      double blk_4x4_var_max = 0;
+      get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col,
+                             TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max,
+                             is_hbd);
+      // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the
+      // threshold for high bit depth.
+      if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0)
+        allow_me_for_sub_blks = false;
+    }
+
     for (int frame = 0; frame < num_frames; frame++) {
       if (frames[frame] == NULL) continue;
 
@@ -855,7 +930,8 @@
         ref_mv.col *= -1;
       } else {  // Other reference frames.
         tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
-                         mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
+                         mb_row, mb_col, &ref_mv, allow_me_for_sub_blks,
+                         subblock_mvs, subblock_mses);
       }
 
       // Perform weighted averaging.
@@ -887,8 +963,9 @@
                 filter_strength, weight_calc_level_in_tf, pred, accum, count);
 #if CONFIG_AV1_HIGHBITDEPTH
           }
-#endif            // CONFIG_AV1_HIGHBITDEPTH
-        } else {  // for 8-bit
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+        } else {
+          // for 8-bit
           if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
             av1_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
@@ -1047,22 +1124,32 @@
   // change the number of frames for key frame filtering, which is to avoid
   // visual quality drop.
   int adjust_num = 6;
+  const int adjust_num_frames_for_arf_filtering =
+      cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering;
   if (num_frames == 1) {  // `arnr_max_frames = 1` is used to disable filtering.
     adjust_num = 0;
   } else if ((update_type == KF_UPDATE) && q <= 10) {
     adjust_num = 0;
-  } else if (cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering &&
-             update_type != KF_UPDATE) {
+  } else if (adjust_num_frames_for_arf_filtering > 0 &&
+             update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) {
+    // Since screen content detection happens after temporal filtering,
+    // the 'frames_since_key' check is added to ensure the sf is disabled for
+    // the first alt-ref frame.
     // Adjust number of frames to be considered for filtering based on noise
     // level of the current frame. For low-noise frame, use more frames to
     // filter such that the filtered frame can provide better predictions for
     // subsequent frames and vice versa.
+    const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 },
+                                                           { 4, 2, 0 } };
+    const uint8_t *adjust_num_frames =
+        av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1];
+
     if (noise_levels[AOM_PLANE_Y] < 0.5)
-      adjust_num = 4;
+      adjust_num = adjust_num_frames[0];
     else if (noise_levels[AOM_PLANE_Y] < 1.0)
-      adjust_num = 2;
+      adjust_num = adjust_num_frames[1];
     else
-      adjust_num = 0;
+      adjust_num = adjust_num_frames[2];
   }
   num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
 
@@ -1166,11 +1253,11 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
-                                                   int height, int width,
-                                                   const int stride,
-                                                   int bit_depth,
-                                                   int edge_thresh) {
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+                                                     int height, int width,
+                                                     const int stride,
+                                                     int bit_depth,
+                                                     int edge_thresh) {
   int64_t accum = 0;
   int count = 0;
   for (int i = 1; i < height - 1; ++i) {
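For reference, a minimal standalone sketch (not part of this diff) of the variance-spread gate that allow_sub_blk_me_in_tf adds to temporal filtering; the example variances in the trailing comment are hypothetical:

#include <math.h>
#include <stdbool.h>

// Sub-block (16x16) motion search is allowed only when the spread of the
// per-4x4 log variances, computed as log1p(var / 16.0) as in
// get_log_var_4x4sub_blk(), exceeds the empirically chosen threshold of 4.0.
static bool allow_sub_blk_me(int var_4x4_min, int var_4x4_max) {
  const double log_var_min = log1p(var_4x4_min / 16.0);
  const double log_var_max = log1p(var_4x4_max / 16.0);
  return (log_var_max - log_var_min) > 4.0;
}

// Example (hypothetical): var_4x4_min = 4 and var_4x4_max = 4096 give a spread
// of roughly 5.3, so the 16x16 sub-block search would run for that block.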
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 8aa4731..0b00c88 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -21,9 +21,9 @@
 struct AV1_COMP;
 struct AV1EncoderConfig;
 struct ThreadData;
-// TODO(any): These two variables are only used in avx2, sse2, sse4
-// implementations, where the block size is still hard coded. This should be
-// fixed to align with the c implementation.
+// TODO(wtc): These two variables are only used in avx2, sse2, neon
+// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
+// This should be fixed to align with the c implementation.
 #define BH 32
 #define BW 32
 
@@ -261,6 +261,9 @@
 #endif  // CONFIG_MULTITHREAD
   // Next temporal filter block row to be filtered.
   int next_tf_row;
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool tf_mt_exit;
 } AV1TemporalFilterSync;
 
 // Estimates noise level from a given frame using a single plane (Y, U, or V).
@@ -353,29 +356,26 @@
 //   num_pels: Number of pixels in the block across all planes.
 //   is_high_bitdepth: Whether the frame is high-bitdepth or not.
 // Returns:
-//   Nothing will be returned. But the contents of tf_data will be modified.
+//   True if allocation is successful and false otherwise.
 static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
                                                int num_pels,
                                                int is_high_bitdepth) {
-  tf_data->tmp_mbmi = (MB_MODE_INFO *)malloc(sizeof(*tf_data->tmp_mbmi));
-  memset(tf_data->tmp_mbmi, 0, sizeof(*tf_data->tmp_mbmi));
+  tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
   tf_data->accum =
       (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
   tf_data->count =
       (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
-  memset(&tf_data->diff, 0, sizeof(tf_data->diff));
   if (is_high_bitdepth)
     tf_data->pred = CONVERT_TO_BYTEPTR(
         aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
   else
     tf_data->pred =
         (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
-  if (!(tf_data->accum && tf_data->count && tf_data->pred)) {
-    aom_free(tf_data->accum);
-    aom_free(tf_data->count);
-    aom_free(tf_data->pred);
+  // In case of an allocation failure, other successfully allocated buffers will
+  // be freed by the tf_dealloc_data() call in encoder_destroy().
+  if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred))
     return false;
-  }
+  memset(&tf_data->diff, 0, sizeof(tf_data->diff));
   return true;
 }
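A minimal sketch of the calling contract described in the comment above; the enclosing setup context (td->tf_data, num_pels, is_high_bitdepth, cm) is assumed for illustration. On failure the caller does not free anything itself, since the partially allocated buffers are released later by tf_dealloc_data() on the teardown path.

    TemporalFilterData *tf_data = &td->tf_data;  // assumed location
    if (!tf_alloc_and_reset_data(tf_data, num_pels, is_high_bitdepth)) {
      // No per-buffer cleanup here; tf_dealloc_data() in encoder_destroy()
      // frees whatever was successfully allocated.
      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                         "Error allocating temporal filter data");
    }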
 
@@ -405,10 +405,14 @@
                                        int is_high_bitdepth) {
   if (is_high_bitdepth)
     tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
-  free(tf_data->tmp_mbmi);
+  aom_free(tf_data->tmp_mbmi);
+  tf_data->tmp_mbmi = NULL;
   aom_free(tf_data->accum);
+  tf_data->accum = NULL;
   aom_free(tf_data->count);
+  tf_data->count = NULL;
   aom_free(tf_data->pred);
+  tf_data->pred = NULL;
 }
 
 // Saves the state prior to temporal filter process.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 3aeb511..ca60e49 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -252,13 +252,15 @@
 static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
                                   uint8_t *cur_frame_buf,
                                   uint8_t *ref_frame_buf, int stride,
-                                  int stride_ref, BLOCK_SIZE bsize,
-                                  MV center_mv, int_mv *best_mv) {
+                                  int ref_stride, int width, int ref_width,
+                                  BLOCK_SIZE bsize, MV center_mv,
+                                  int_mv *best_mv) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
   int step_param;
   uint32_t bestsme = UINT_MAX;
+  FULLPEL_MV_STATS best_mv_stats;
   int distortion;
   uint32_t sse;
   int cost_list[5];
@@ -267,28 +269,29 @@
   // Setup frame pointers
   x->plane[0].src.buf = cur_frame_buf;
   x->plane[0].src.stride = stride;
+  x->plane[0].src.width = width;
   xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride_ref;
+  xd->plane[0].pre[0].stride = ref_stride;
+  xd->plane[0].pre[0].width = ref_width;
 
   step_param = tpl_sf->reduce_first_step_size;
   step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
   const search_site_config *search_site_cfg =
       cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
-  if (search_site_cfg->stride != stride_ref)
+  if (search_site_cfg->stride != ref_stride)
     search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
-  assert(search_site_cfg->stride == stride_ref);
+  assert(search_site_cfg->stride == ref_stride);
 
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
                                      start_mv, search_site_cfg,
+                                     tpl_sf->search_method,
                                      /*fine_search_interval=*/0);
-  av1_set_mv_search_method(&full_ms_params, search_site_cfg,
-                           tpl_sf->search_method);
 
   bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                                   cond_cost_list(cpi, cost_list),
-                                  &best_mv->as_fullmv, NULL);
+                                  &best_mv->as_fullmv, &best_mv_stats, NULL);
 
   // When sub-pel motion search is skipped, populate sub-pel precision MV and
   // return.
@@ -303,11 +306,12 @@
   ms_params.forced_stop = tpl_sf->subpel_force_stop;
   ms_params.var_params.subpel_search_type = USE_2_TAPS;
   ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+  best_mv_stats.err_cost = 0;
   MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
   assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
   bestsme = cpi->mv_search_params.find_fractional_mv_step(
-      xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &distortion, &sse,
-      NULL);
+      xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv,
+      &distortion, &sse, NULL);
 
   return bestsme;
 }
@@ -451,8 +455,74 @@
   }
 }
 
+static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd,
+                                         const uint8_t *src_mb_buffer,
+                                         int src_stride,
+                                         TplBuffers *tpl_tmp_buffers,
+                                         BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                         int mi_row, int mi_col, int rf_idx,
+                                         MV *rfidx_mv, int use_pred_sad) {
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
+  const YV12_BUFFER_CONFIG *const ref_frame_ptr =
+      tpl_data->src_ref_frame[rf_idx];
+  int16_t *src_diff = tpl_tmp_buffers->src_diff;
+  tran_low_t *coeff = tpl_tmp_buffers->coeff;
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  int32_t inter_cost;
+
+  if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) {
+    const int_interpfilters kernel =
+        av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+    uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+    uint8_t *predictor =
+        is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+    struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+                              ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+                              ref_frame_ptr->y_stride };
+    InterPredParams inter_pred_params;
+    av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+                          &tpl_data->sf, &ref_buf, kernel);
+    inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+    av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv,
+                                      &inter_pred_params);
+
+    if (use_pred_sad) {
+      inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride,
+                                                    predictor, bw);
+    } else {
+      inter_cost =
+          tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                            predictor, bw, coeff, bw, bh, tx_size);
+    }
+  } else {
+    int ref_mb_offset =
+        mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+    uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+    int ref_stride = ref_frame_ptr->y_stride;
+    const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv);
+    // Since sub-pel motion search is not performed, use the prediction pixels
+    // directly from the reference block ref_mb.
+    if (use_pred_sad) {
+      inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(
+          src_mb_buffer, src_stride,
+          &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride);
+    } else {
+      inter_cost =
+          tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                            &ref_mb[fullmv.row * ref_stride + fullmv.col],
+                            ref_stride, coeff, bw, bh, tx_size);
+    }
+  }
+  return inter_cost;
+}
+
 static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
                                        TplTxfmStats *tpl_txfm_stats,
+                                       TplBuffers *tpl_tmp_buffers,
                                        MACROBLOCK *x, int mi_row, int mi_col,
                                        BLOCK_SIZE bsize, TX_SIZE tx_size,
                                        TplDepStats *tpl_stats) {
@@ -470,8 +540,6 @@
 
   const int bw = 4 << mi_size_wide_log2[bsize];
   const int bh = 4 << mi_size_high_log2[bsize];
-  const int_interpfilters kernel =
-      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
   int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
 
@@ -479,9 +547,11 @@
   int32_t intra_cost;
   PREDICTION_MODE best_mode = DC_PRED;
 
-  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  const int mb_y_offset =
+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
   uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
-  int src_stride = xd->cur_buf->y_stride;
+  const int src_stride = xd->cur_buf->y_stride;
+  const int src_width = xd->cur_buf->y_width;
 
   int dst_mb_offset =
       mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
@@ -507,29 +577,16 @@
     pd->subsampling_y = xd->cur_buf->subsampling_y;
   }
 
-  // Number of pixels in a tpl block
-  const int tpl_block_pels = tpl_data->tpl_bsize_1d * tpl_data->tpl_bsize_1d;
-  // Allocate temporary buffers used in motion estimation.
-  uint8_t *predictor8 = aom_memalign(32, tpl_block_pels * 2 * sizeof(uint8_t));
-  int16_t *src_diff = aom_memalign(32, tpl_block_pels * sizeof(int16_t));
-  tran_low_t *coeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
-  tran_low_t *qcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
-  tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+  int16_t *src_diff = tpl_tmp_buffers->src_diff;
+  tran_low_t *coeff = tpl_tmp_buffers->coeff;
+  tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff;
+  tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff;
   uint8_t *predictor =
       is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
   int64_t recon_error = 1;
   int64_t pred_error = 1;
 
-  if (!(predictor8 && src_diff && coeff && qcoeff && dqcoeff)) {
-    aom_free(predictor8);
-    aom_free(src_diff);
-    aom_free(coeff);
-    aom_free(qcoeff);
-    aom_free(dqcoeff);
-    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
-                       "Error allocating tpl data");
-  }
-
   memset(tpl_stats, 0, sizeof(*tpl_stats));
   tpl_stats->ref_frame_index[0] = -1;
   tpl_stats->ref_frame_index[1] = -1;
@@ -576,15 +633,32 @@
                             tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
                             dst_buffer_stride, predictor, bw, 0, 0, 0);
 
-    intra_cost =
-        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
-                          predictor, bw, coeff, bw, bh, tx_size);
+    if (tpl_frame->use_pred_sad) {
+      intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf(
+          src_mb_buffer, src_stride, predictor, bw);
+    } else {
+      intra_cost =
+          tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                            predictor, bw, coeff, bw, bh, tx_size);
+    }
 
     if (intra_cost < best_intra_cost) {
       best_intra_cost = intra_cost;
       best_mode = mode;
     }
   }
+  // Calculate SATD of the best intra mode if SAD was used for mode decision,
+  // as best_intra_cost is used in the ML model to skip intra mode evaluation.
+  if (tpl_frame->use_pred_sad) {
+    av1_predict_intra_block(
+        xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+        block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0,
+        0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0,
+        0, 0);
+    best_intra_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
+  }
 
   int rate_cost = 1;
 
@@ -653,10 +727,11 @@
     }
 
     const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
-    int ref_mb_offset =
+    const int ref_mb_offset =
         mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
     uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
-    int ref_stride = ref_frame_ptr->y_stride;
+    const int ref_stride = ref_frame_ptr->y_stride;
+    const int ref_width = ref_frame_ptr->y_width;
 
     int_mv best_rfidx_mv = { 0 };
     uint32_t bestsme = UINT32_MAX;
@@ -743,9 +818,9 @@
 
     for (idx = 0; idx < refmv_count; ++idx) {
       int_mv this_mv;
-      uint32_t thissme = motion_estimation(cpi, x, src_mb_buffer, ref_mb,
-                                           src_stride, ref_stride, bsize,
-                                           center_mvs[idx].mv.as_mv, &this_mv);
+      uint32_t thissme = motion_estimation(
+          cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width,
+          ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv);
 
       if (thissme < bestsme) {
         bestsme = thissme;
@@ -756,32 +831,10 @@
     tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
     single_mv[rf_idx] = best_rfidx_mv;
 
-    if (tpl_sf->subpel_force_stop != FULL_PEL) {
-      struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
-                                ref_frame_ptr->y_width, ref_frame_ptr->y_height,
-                                ref_frame_ptr->y_stride };
-      InterPredParams inter_pred_params;
-      av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
-                            mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
-                            0, &tpl_data->sf, &ref_buf, kernel);
-      inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
-
-      av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
-                                        &inter_pred_params);
-
-      inter_cost =
-          tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
-                            predictor, bw, coeff, bw, bh, tx_size);
-    } else {
-      const FULLPEL_MV best_fullmv = get_fullmv_from_mv(&best_rfidx_mv.as_mv);
-      // Since sub-pel motion search is not performed, use the prediction pixels
-      // directly from the reference block ref_mb
-      inter_cost = tpl_get_satd_cost(
-          bd_info, src_diff, bw, src_mb_buffer, src_stride,
-          &ref_mb[best_fullmv.row * ref_stride + best_fullmv.col], ref_stride,
-          coeff, bw, bh, tx_size);
-    }
-    // Store inter cost for each ref frame
+    inter_cost = get_inter_cost(
+        cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+        mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad);
+    // Store inter cost for each ref frame. This is used to prune inter modes.
     tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
 
     if (inter_cost < best_inter_cost) {
@@ -791,6 +844,14 @@
       best_mv[0].as_int = best_rfidx_mv.as_int;
     }
   }
+  // Calculate SATD of the best inter mode if SAD was used for mode decision,
+  // as best_inter_cost is used in the ML model to skip intra mode evaluation.
+  if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) {
+    assert(best_rf_idx != -1);
+    best_inter_cost = get_inter_cost(
+        cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+        mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */);
+  }
 
   if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) {
     best_mode = NEWMV;
@@ -841,6 +902,8 @@
   xd->mi_row = mi_row;
   xd->mi_col = mi_col;
   int best_cmp_rf_idx = -1;
+  const int_interpfilters kernel =
+      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
     int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
     int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
@@ -1039,13 +1102,6 @@
       }
     }
   }
-
-  // Free temporary buffers.
-  aom_free(predictor8);
-  aom_free(src_diff);
-  aom_free(coeff);
-  aom_free(qcoeff);
-  aom_free(dqcoeff);
 }
 
 static int round_floor(int ref_pos, int bsize_pix) {
@@ -1231,9 +1287,10 @@
   const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
   uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
   const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
   int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
       gf_group, cpi->sf.inter_sf.selective_ref_frame,
-      cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx);
+      tpl_sf->prune_ref_frames_in_tpl, frame_idx);
   int gop_length = get_gop_length(gf_group);
   int ref_frame_flags;
   AV1_COMMON *cm = &cpi->common;
@@ -1341,18 +1398,27 @@
   av1_init_tpl_txfm_stats(tpl_txfm_stats);
 
   // Initialize x->mbmi_ext when compound predictions are enabled.
-  if (cpi->sf.tpl_sf.allow_compound_pred) av1_zero(x->mbmi_ext);
+  if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext);
 
   // Set the pointer to null since mbmi is only allocated inside this function.
   assert(xd->mi == &mbmi_ptr);
   xd->mi = NULL;
+
+  // The tpl module is called before the setting of speed features at frame
+  // level. Thus, turning off this speed feature for key frames is done here
+  // and not integrated into the speed feature setting itself.
+  const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0;
+  tpl_frame->use_pred_sad =
+      tpl_sf->use_sad_for_mode_decision &&
+      gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+      gf_group->layer_depth[frame_idx] >= layer_depth_th;
 }
 
 // This function stores the motion estimation dependencies of all the blocks in
 // a row
 void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
-                               MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize,
-                               TX_SIZE tx_size) {
+                               TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+                               int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) {
   AV1_COMMON *const cm = &cpi->common;
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
@@ -1372,6 +1438,17 @@
        mi_col += mi_width, tplb_col_in_tile++) {
     (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
                                  tplb_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+    if (mt_info->num_workers > 1) {
+      pthread_mutex_lock(tpl_row_mt->mutex_);
+      const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit;
+      pthread_mutex_unlock(tpl_row_mt->mutex_);
+      // Exit in case any worker has encountered an error.
+      if (tpl_mt_exit) return;
+    }
+#endif
+
     TplDepStats tpl_stats;
 
     // Motion estimation column boundary
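The check above is the reader side of the tpl_mt_exit convention; the writer side is not part of this section. A sketch of how a worker is expected to signal the abort, assuming it runs from the worker thread's error path:

    #if CONFIG_MULTITHREAD
      // Assumed worker-side error path (not shown in this section): set the
      // flag under the same mutex so other rows exit at their next check.
      pthread_mutex_lock(tpl_row_mt->mutex_);
      tpl_row_mt->tpl_mt_exit = true;
      pthread_mutex_unlock(tpl_row_mt->mutex_);
    #endif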
@@ -1380,8 +1457,8 @@
     xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
     xd->mb_to_right_edge =
         GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
-    mode_estimation(cpi, tpl_txfm_stats, x, mi_row, mi_col, bsize, tx_size,
-                    &tpl_stats);
+    mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col,
+                    bsize, tx_size, &tpl_stats);
 
     // Motion flow dependency dispenser.
     tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
@@ -1408,8 +1485,8 @@
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
-    av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, x, mi_row, bsize,
-                              tx_size);
+    av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x,
+                              mi_row, bsize, tx_size);
   }
 }
 
@@ -1703,6 +1780,34 @@
   }
 }
 
+static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group,
+                                         int frame_idx, int gop_eval,
+                                         int approx_gop_eval,
+                                         int reduce_num_frames) {
+  // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+  // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+  // tpl stats calculation is limited to ARFs from base layer and (base+1)
+  // layer.
+  const int num_arf_layers = (gop_eval == 2) ? 3 : 2;
+  const int gop_length = get_gop_length(gf_group);
+
+  if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+    return 1;
+
+  // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+  // frames and for frames beyond gop length.
+  if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                          frame_idx >= gop_length))
+    return 1;
+
+  if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE &&
+      frame_idx < gop_length)
+    return 1;
+
+  return 0;
+}
+
 int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
                         const EncodeFrameParams *const frame_params) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -1716,13 +1821,6 @@
   EncodeFrameParams this_frame_params = *frame_params;
   TplParams *const tpl_data = &cpi->ppi->tpl_data;
   int approx_gop_eval = (gop_eval > 1);
-  int num_arf_layers = MAX_ARF_LAYERS;
-
-  // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
-  // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
-  // tpl stats calculation is limited to ARFs from base layer and (base+1)
-  // layer.
-  if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 3 : 2;
 
   if (cpi->superres_mode != AOM_SUPERRES_NONE) {
     assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
@@ -1751,6 +1849,12 @@
 
   av1_init_tpl_stats(tpl_data);
 
+  TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
+  if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating tpl data");
+  }
+
   tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
   tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
 
@@ -1763,20 +1867,26 @@
   av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
                     cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
 
-  const int gop_length = get_gop_length(gf_group);
   const int num_planes =
       cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
+  // As the tpl module is called before the setting of speed features at
+  // frame level, turning off this speed feature for the first GF group of
+  // the key-frame interval is done here.
+  int reduce_num_frames =
+      cpi->sf.tpl_sf.reduce_num_frames &&
+      gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+      gf_group->max_layer_depth > 2;
+  // TPL processing is skipped for frames of type LF_UPDATE when
+  // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a
+  // factor to adjust r0 is used. The value of 1.6 corresponds to using ~60%
+  // of the frames in the gf group on average.
+  tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0;
+
   // Backward propagation from tpl_group_frames to 1.
   for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
        ++frame_idx) {
-    if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
-        gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
-      continue;
-
-    // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
-    // frames and for frames beyond gop length.
-    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
-                            frame_idx >= gop_length))
+    if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+                           reduce_num_frames))
       continue;
 
     init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
@@ -1806,12 +1916,8 @@
 
   for (int frame_idx = tpl_gf_group_frames - 1;
        frame_idx >= cpi->gf_frame_index; --frame_idx) {
-    if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
-        gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
-      continue;
-
-    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
-                            frame_idx >= gop_length))
+    if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+                           reduce_num_frames))
       continue;
 
     mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
@@ -1831,6 +1937,8 @@
     end_timing(cpi, av1_tpl_setup_stats_time);
 #endif
 
+  tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+
   if (!approx_gop_eval) {
     tpl_data->ready = 1;
   }
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 36c3ae0..bcd5821 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -24,6 +24,7 @@
 struct EncodeFrameParams;
 struct EncodeFrameInput;
 struct GF_GROUP;
+struct ThreadData;
 struct TPL_INFO;
 
 #include "config/aom_config.h"
@@ -70,6 +71,13 @@
 } AV1TplRowMultiThreadSync;
 
 typedef struct AV1TplRowMultiThreadInfo {
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool tpl_mt_exit;
+#if CONFIG_MULTITHREAD
+  // Mutex lock object used for error handling.
+  pthread_mutex_t *mutex_;
+#endif
   // Row synchronization related function pointers.
   void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
   void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
@@ -103,6 +111,14 @@
   int coeff_num;
 } TplTxfmStats;
 
+typedef struct {
+  uint8_t *predictor8;
+  int16_t *src_diff;
+  tran_low_t *coeff;
+  tran_low_t *qcoeff;
+  tran_low_t *dqcoeff;
+} TplBuffers;
+
 typedef struct TplDepStats {
   int64_t srcrf_sse;
   int64_t srcrf_dist;
@@ -137,6 +153,8 @@
   int mi_cols;
   int base_rdmult;
   uint32_t frame_display_index;
+  // When set, SAD metric is used for intra and inter mode decision.
+  int use_pred_sad;
 } TplDepFrame;
 
 /*!\endcond */
@@ -227,6 +245,10 @@
    */
   int border_in_pixels;
 
+  /*!
+   * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+   */
+  double r0_adjust_factor;
 } TplParams;
 
 #if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
@@ -393,6 +415,45 @@
                            CommonModeInfoParams *const mi_params, int width,
                            int height, int byte_alignment, int lag_in_frames);
 
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+  aom_free(tpl_tmp_buffers->predictor8);
+  tpl_tmp_buffers->predictor8 = NULL;
+  aom_free(tpl_tmp_buffers->src_diff);
+  tpl_tmp_buffers->src_diff = NULL;
+  aom_free(tpl_tmp_buffers->coeff);
+  tpl_tmp_buffers->coeff = NULL;
+  aom_free(tpl_tmp_buffers->qcoeff);
+  tpl_tmp_buffers->qcoeff = NULL;
+  aom_free(tpl_tmp_buffers->dqcoeff);
+  tpl_tmp_buffers->dqcoeff = NULL;
+}
+
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+                                              uint8_t tpl_bsize_1d) {
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+  // Allocate temporary buffers used in mode estimation.
+  tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign(
+      32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8));
+  tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff));
+  tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff));
+  tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff));
+  tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff));
+
+  if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff &&
+        tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff &&
+        tpl_tmp_buffers->dqcoeff)) {
+    tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+    return false;
+  }
+  return true;
+}
+
 /*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
  * group) and selects between 16 and 32 frame GOP structure.
  *
@@ -424,7 +485,8 @@
                              BLOCK_SIZE sb_size, int mi_row, int mi_col);
 
 void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
-                               TplTxfmStats *tpl_txfm_stats, MACROBLOCK *x,
+                               TplTxfmStats *tpl_txfm_stats,
+                               TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
                                int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
 
 /*!\brief  Compute the entropy of an exponential probability distribution
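Taken together with the tpl_model.c hunk above, the lifetime of the TplBuffers added here is: allocate once per thread, reuse the buffers for every block handled by mode_estimation()/get_inter_cost(), and free them when TPL stats collection is done. The sketch below is condensed from the single-threaded path in av1_tpl_setup_stats(); the multi-threaded path is assumed to mirror it per worker thread.

    TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
    if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
      aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                         "Error allocating tpl data");
    }
    // ... rows are dispatched; mode_estimation() reuses these buffers ...
    tpl_dealloc_temp_buffers(tpl_tmp_buffers);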
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c
index 8f59373..92fc4b2 100644
--- a/av1/encoder/tune_butteraugli.c
+++ b/av1/encoder/tune_butteraugli.c
@@ -220,8 +220,11 @@
         cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
         cm->features.byte_alignment, 0, 0);
   }
-  av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth,
-                                           av1_num_planes(cm));
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers during resize");
+  }
 
   zero_img(cpi->source);
   copy_img(resized_dst, cpi->source, width / resize_factor,
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 9c7c112..4e5ffa3 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -42,6 +42,7 @@
 
   // Parameters used for motion search.
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  FULLPEL_MV_STATS best_mv_stats;
   const SEARCH_METHODS search_method = NSTEP;
   const search_site_config *search_site_cfg =
       cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
@@ -64,10 +65,11 @@
   // Only do full search on the entire block.
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
                                      &baseline_mv, *ref_mv, search_site_cfg,
+                                     search_method,
                                      /*fine_search_interval=*/0);
-  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
   av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
-                        cond_cost_list(cpi, cost_list), ref_mv, NULL);
+                        cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+                        NULL);
 
   // Restore input state.
   mb->plane[0].src = ori_src_buf;
@@ -621,8 +623,11 @@
       &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
       ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment, 0, 0);
-  av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
-                                           bit_depth, av1_num_planes(cm));
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers during resize");
+  }
 
   const int resized_y_width = resized_source.y_width;
   const int resized_y_height = resized_source.y_height;
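Both motion-search call sites touched above follow the same interface change: the search method is now passed directly to av1_make_default_fullpel_ms_params() instead of via a separate av1_set_mv_search_method() call, and av1_full_pixel_search() takes an additional FULLPEL_MV_STATS out-parameter. A condensed before/after sketch with abbreviated argument names (ms_params, cfg, step, etc. stand in for the locals used at each call site):

    // Old pattern (removed in this patch):
    //   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
    //                                      start_mv, cfg,
    //                                      /*fine_search_interval=*/0);
    //   av1_set_mv_search_method(&ms_params, cfg, search_method);
    //   av1_full_pixel_search(start_mv, &ms_params, step, cost_list,
    //                         &best_mv, NULL);

    // New pattern (as used in tpl_model.c and tune_vmaf.c above):
    FULLPEL_MV_STATS best_mv_stats;
    av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                       start_mv, cfg, search_method,
                                       /*fine_search_interval=*/0);
    av1_full_pixel_search(start_mv, &ms_params, step, cost_list, &best_mv,
                          &best_mv_stats, NULL);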
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index d6217b7..7292c01 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -119,13 +119,10 @@
   *rd_stats = mb_rd_info->rd_stats;
 }
 
-// Compute the pixel domain distortion from diff on all visible 4x4s in the
-// transform block.
-static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
-                                      int blk_row, int blk_col,
-                                      const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize,
-                                      unsigned int *block_mse_q8) {
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+                            int blk_col, const BLOCK_SIZE plane_bsize,
+                            const BLOCK_SIZE tx_bsize,
+                            unsigned int *block_mse_q8) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -188,7 +185,7 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+  *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
 
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
@@ -243,7 +240,7 @@
 
 // Used to set proper context for early termination with skip = 1.
 static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
-                                     int bsize, int64_t dist) {
+                                     BLOCK_SIZE bsize, int64_t dist) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int n4 = bsize_to_num_blk(bsize);
@@ -644,7 +641,7 @@
         get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
-    if (x->skip_chroma_rd && plane) continue;
+    if (plane) continue;
 
     cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
                             pd->dst.stride, &sse);
@@ -2030,12 +2027,9 @@
   uint16_t best_eob = 0;
   TX_TYPE best_tx_type = DCT_DCT;
   int rate_cost = 0;
-  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
-  // of the best tx_type
-  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
   struct macroblock_plane *const p = &x->plane[plane];
   tran_low_t *orig_dqcoeff = p->dqcoeff;
-  tran_low_t *best_dqcoeff = this_dqcoeff;
+  tran_low_t *best_dqcoeff = x->dqcoeff_buf;
   const int tx_type_map_idx =
       plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
   av1_invalid_rd_stats(best_rd_stats);
@@ -2071,8 +2065,8 @@
       return;
     }
   } else {
-    block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
-                                txsize_to_bsize[tx_size], &block_mse_q8);
+    block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                    txsize_to_bsize[tx_size], &block_mse_q8);
     assert(block_mse_q8 != UINT_MAX);
   }
 
@@ -2080,7 +2074,7 @@
   uint16_t tx_mask;
 
   // Use DCT_DCT transform for DC only block.
-  if (dc_only_blk)
+  if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
     tx_mask = 1 << DCT_DCT;
   else
     tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index b3689cf..ed95c1c 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -47,6 +47,27 @@
   return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 }
 
+/*!\brief Compute the pixel domain distortion.
+ *
+ * \ingroup transform_search
+ * Compute the pixel domain distortion from diff on all visible 4x4s in the
+ * transform block.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          Plane index
+ * \param[in]    blk_row        Block row index
+ * \param[in]    blk_col        Block col index
+ * \param[in]    plane_bsize    Current plane block size
+ * \param[in]    tx_bsize       Transform size
+ * \param[in]    block_mse_q8   Block mse
+ * \return       An int64_t value that is the block sse.
+ */
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+                            int blk_col, const BLOCK_SIZE plane_bsize,
+                            const BLOCK_SIZE tx_bsize,
+                            unsigned int *block_mse_q8);
+
 int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                               RD_STATS *rd_stats, int64_t ref_best_rd,
                               BLOCK_SIZE bs, TX_SIZE tx_size);
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 5b8f598..d545600 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -29,6 +29,7 @@
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/var_based_part.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
 
 // Possible values for the force_split variable while evaluating variance based
 // partitioning.
@@ -1021,8 +1022,7 @@
   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
       cpi->rc.high_source_sad) {
     shift_lower_limit = 7;
-  } else if (source_sad_nonrd >= kMedSad &&
-             cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+  } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
              cpi->common.width * cpi->common.height >= 640 * 360) {
     shift_upper_limit = 2;
     shift_lower_limit = source_sad_nonrd > kMedSad ? 5 : 4;
@@ -1243,6 +1243,7 @@
     *y_sad = *y_sad_g;
     *ref_frame_partition = GOLDEN_FRAME;
     x->nonrd_prune_ref_frame_search = 0;
+    x->sb_me_partition = 0;
   } else if (is_set_altref_ref_frame) {
     av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
                          get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
@@ -1251,6 +1252,7 @@
     *y_sad = *y_sad_alt;
     *ref_frame_partition = ALTREF_FRAME;
     x->nonrd_prune_ref_frame_search = 0;
+    x->sb_me_partition = 0;
   } else {
     *ref_frame_partition = LAST_FRAME;
     x->nonrd_prune_ref_frame_search =
@@ -1339,11 +1341,14 @@
 static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
                          unsigned int *y_sad_g, unsigned int *y_sad_alt,
                          unsigned int *y_sad_last,
-                         MV_REFERENCE_FRAME *ref_frame_partition, int mi_row,
+                         MV_REFERENCE_FRAME *ref_frame_partition,
+                         struct scale_factors *sf_no_scale, int mi_row,
                          int mi_col, bool is_small_sb, bool scaled_ref_last) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
+  bool scaled_ref_golden = false;
+  bool scaled_ref_alt = false;
   BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
   MB_MODE_INFO *mi = xd->mi[0];
   const YV12_BUFFER_CONFIG *yv12 =
@@ -1361,21 +1366,22 @@
                     cpi->sf.rt_sf.use_nonrd_altref_frame ||
                     (cpi->sf.rt_sf.use_comp_ref_nonrd &&
                      cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1);
-  // On a resized frame (reference has different scale) only use
-  // LAST as reference for partitioning for now.
-  if (scaled_ref_last) {
-    use_golden_ref = 0;
-    use_alt_ref = 0;
-  }
 
   // For 1 spatial layer: GOLDEN is another temporal reference.
   // Check if it should be used as reference for partitioning.
   if (cpi->svc.number_spatial_layers == 1 && use_golden_ref &&
       (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
     yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+    if (yv12_g && (yv12_g->y_crop_height != cm->height ||
+                   yv12_g->y_crop_width != cm->width)) {
+      yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      scaled_ref_golden = true;
+    }
     if (yv12_g && yv12_g != yv12) {
-      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
-                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+      av1_setup_pre_planes(
+          xd, 0, yv12_g, mi_row, mi_col,
+          scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME),
+          num_planes);
       *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
           x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
           xd->plane[AOM_PLANE_Y].pre[0].buf,
@@ -1389,9 +1395,16 @@
       (cpi->ref_frame_flags & AOM_ALT_FLAG) &&
       (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
     yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+    if (yv12_alt && (yv12_alt->y_crop_height != cm->height ||
+                     yv12_alt->y_crop_width != cm->width)) {
+      yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+      scaled_ref_alt = true;
+    }
     if (yv12_alt && yv12_alt != yv12) {
-      av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
-                           get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
+      av1_setup_pre_planes(
+          xd, 0, yv12_alt, mi_row, mi_col,
+          scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME),
+          num_planes);
       *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf(
           x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
           xd->plane[AOM_PLANE_Y].pre[0].buf,
@@ -1420,9 +1433,37 @@
 
     if (est_motion == 1 || est_motion == 2) {
       if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-        const MV dummy_mv = { 0, 0 };
-        *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
-                                               mi_row, mi_col, &dummy_mv);
+        // For screen, only do int_pro_motion for spatial variance above the
+        // threshold and motion level above LowSad.
+        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
+          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+          int me_search_size_col =
+              is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1;
+          // For screen use larger search size row motion to capture
+          // vertical scroll, which can be larger motion.
+          int me_search_size_row =
+              is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1;
+          unsigned int y_sad_zero;
+          *y_sad = av1_int_pro_motion_estimation(
+              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
+              &y_sad_zero, me_search_size_col, me_search_size_row);
+          // The logic below selects whether the motion estimated by
+          // int_pro_motion() will be used in nonrd_pickmode. Only do this
+          // for screen content for now.
+          if (is_screen) {
+            unsigned int thresh_sad =
+                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+              x->sb_me_partition = 1;
+              x->sb_me_mv.as_int = mi->mv[0].as_int;
+            } else {
+              x->sb_me_partition = 0;
+              // Fall back to using zero motion.
+              *y_sad = y_sad_zero;
+              mi->mv[0].as_int = 0;
+            }
+          }
+        }
       }
     }
 
@@ -1450,7 +1491,12 @@
 
   // Only calculate the predictor for non-zero MV.
   if (mi->mv[0].as_int != 0) {
-    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    if (!scaled_ref_last) {
+      set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    } else {
+      xd->block_ref_scale_factors[0] = sf_no_scale;
+      xd->block_ref_scale_factors[1] = sf_no_scale;
+    }
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
                                   cm->seq_params->sb_size, AOM_PLANE_Y,
                                   num_planes - 1);
@@ -1517,8 +1563,8 @@
       uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) {
     set_block_size(cpi, mi_row, mi_col, bsize);
     x->force_zeromv_skip_for_sb = 1;
-    if (vt2) aom_free(vt2);
-    if (vt) aom_free(vt);
+    aom_free(vt2);
+    aom_free(vt);
     // Partition shape is set here at SB level.
     // Exit needs to happen from av1_choose_var_based_partitioning().
     return true;
@@ -1558,6 +1604,9 @@
   NOISE_LEVEL noise_level = kLow;
   bool is_zero_motion = true;
   bool scaled_ref_last = false;
+  struct scale_factors sf_no_scale;
+  av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+                                    cm->width, cm->height);
 
   bool is_key_frame =
       (frame_is_intra_only(cm) ||
@@ -1578,7 +1627,7 @@
   // Ref frame used in partitioning.
   MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
 
-  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
+  AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
 
   vt->split = td->vt64x64;
 
@@ -1645,10 +1694,19 @@
     }
   }
 
+  x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance only for superblocks with
+  // some motion for now. This input can then be used to bias the partitioning
+  // or the chroma_check.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      x->content_state_sb.source_sad_nonrd > kLowSad)
+    x->source_variance = av1_get_perpixel_variance_facade(
+        cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
   if (!is_key_frame) {
     setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
-                 &ref_frame_partition, mi_row, mi_col, is_small_sb,
-                 scaled_ref_last);
+                 &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+                 is_small_sb, scaled_ref_last);
 
     MB_MODE_INFO *mi = xd->mi[0];
     // Use reference SB directly for zero mv.
@@ -1690,8 +1748,14 @@
   if (cpi->noise_estimate.enabled)
     noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
 
-  if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
+  if (low_res && threshold_4x4avg < INT64_MAX) {
+    vt2 = aom_malloc(sizeof(*vt2));
+    if (!vt2) {
+      aom_free(vt);
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Error allocating partition buffer vt2");
+    }
+  }
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
@@ -1869,8 +1933,8 @@
                           ref_frame_partition, mi_col, mi_row, is_small_sb);
   }
 
-  if (vt2) aom_free(vt2);
-  if (vt) aom_free(vt);
+  aom_free(vt2);
+  aom_free(vt);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, choose_var_based_partitioning_time);
 #endif
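The "if (vt) ..." guards are dropped in the two hunks above because aom_free(), like free(), accepts a NULL pointer and does nothing, so the checks were redundant. A minimal illustration of the resulting cleanup pattern:

    aom_free(vt2);  // safe even when vt2 was never allocated (still NULL)
    aom_free(vt);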
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
index ad0b374..52ddc66 100644
--- a/av1/encoder/x86/av1_k_means_avx2.c
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -10,7 +10,7 @@
  */
 #include <immintrin.h>  // AVX2
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 #include "aom_dsp/x86/synonyms.h"
 
 static int64_t k_means_horizontal_sum_avx2(__m256i a) {
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c
index 4338bf7..6c75822 100644
--- a/av1/encoder/x86/av1_k_means_sse2.c
+++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 #include "aom_dsp/x86/synonyms.h"
 
 static int64_t k_means_horizontal_sum_sse2(__m128i a) {
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c
index 9e06ebe..ebe7531 100644
--- a/av1/encoder/x86/hash_sse42.c
+++ b/av1/encoder/x86/hash_sse42.c
@@ -12,6 +12,8 @@
 #include <stdint.h>
 #include <smmintrin.h>
 
+#include "config/av1_rtcd.h"
+
 // Byte-boundary alignment issues
 #define ALIGN_SIZE 8
 #define ALIGN_MASK (ALIGN_SIZE - 1)
diff --git a/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/av1/encoder/x86/highbd_block_error_intrin_avx2.c
index ee3714d..340307c 100644
--- a/av1/encoder/x86/highbd_block_error_intrin_avx2.c
+++ b/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include "aom/aom_integer.h"
 #include "av1/common/common.h"
+#include "config/av1_rtcd.h"
 
 int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
                                     const tran_low_t *dqcoeff,
diff --git a/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/av1/encoder/x86/highbd_block_error_intrin_sse2.c
index 0287f01..b0b2757 100644
--- a/av1/encoder/x86/highbd_block_error_intrin_sse2.c
+++ b/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 
 #include "av1/common/common.h"
+#include "config/av1_rtcd.h"
 
 int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
                                     const tran_low_t *dqcoeff,
diff --git a/av1/encoder/x86/ml_avx2.c b/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 0000000..6432708
--- /dev/null
+++ b/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+#define CALC_OUTPUT_FOR_2ROWS                                               \
+  const int index = weight_idx + (2 * i * tot_num_inputs);                  \
+  const __m256 weight0 = _mm256_loadu_ps(&weights[index]);                  \
+  const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+  const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);                    \
+  const __m256 mul1 = _mm256_mul_ps(inputs256, weight1);                    \
+  hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+static INLINE void nn_propagate_8to1(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  // Process one output row at a time.
+  for (int out = 0; out < num_outputs; out++) {
+    __m256 in_result = _mm256_setzero_ps();
+    float bias_val = bias[out];
+    for (int in = 0; in < num_inputs_to_process; in += 8) {
+      const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+      const int weight_idx = in + (out * tot_num_inputs);
+      const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]);
+      const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);
+      in_result = _mm256_add_ps(in_result, mul0);
+    }
+    const __m128 low_128 = _mm256_castps256_ps128(in_result);
+    const __m128 high_128 = _mm256_extractf128_ps(in_result, 1);
+    const __m128 sum_par_0 = _mm_add_ps(low_128, high_128);
+    const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0);
+    const __m128 sum_tot =
+        _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1);
+
+    bias_val += _mm_cvtss_f32(sum_tot);
+    if (is_clip_required) bias_val = AOMMAX(bias_val, 0);
+    output_nodes[out] = bias_val;
+  }
+}
+
+static INLINE void nn_propagate_8to4(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  __m256 hadd[2];
+  for (int out = 0; out < num_outputs; out += 4) {
+    __m128 bias_reg = _mm_loadu_ps(&bias[out]);
+    __m128 in_result = _mm_setzero_ps();
+    for (int in = 0; in < num_inputs_to_process; in += 8) {
+      const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+      const int weight_idx = in + (out * tot_num_inputs);
+      // Process two output rows at a time.
+      for (int i = 0; i < 2; i++) {
+        CALC_OUTPUT_FOR_2ROWS
+      }
+
+      const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]);
+      const __m128 low_128 = _mm256_castps256_ps128(sum_par);
+      const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1);
+      const __m128 result = _mm_add_ps(low_128, high_128);
+
+      in_result = _mm_add_ps(in_result, result);
+    }
+
+    in_result = _mm_add_ps(in_result, bias_reg);
+    if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps());
+    _mm_storeu_ps(&output_nodes[out], in_result);
+  }
+}
+
+static INLINE void nn_propagate_8to8(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  __m256 hadd[4];
+  for (int out = 0; out < num_outputs; out += 8) {
+    __m256 bias_reg = _mm256_loadu_ps(&bias[out]);
+    __m256 in_result = _mm256_setzero_ps();
+    for (int in = 0; in < num_inputs_to_process; in += 8) {
+      const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+      const int weight_idx = in + (out * tot_num_inputs);
+      // Process two output rows at a time.
+      for (int i = 0; i < 4; i++) {
+        CALC_OUTPUT_FOR_2ROWS
+      }
+      const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]);
+      const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]);
+
+      __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20);
+      __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31);
+
+      __m256 result = _mm256_add_ps(ht_0, ht_1);
+      in_result = _mm256_add_ps(in_result, result);
+    }
+    in_result = _mm256_add_ps(in_result, bias_reg);
+    if (is_clip_required)
+      in_result = _mm256_max_ps(in_result, _mm256_setzero_ps());
+    _mm256_storeu_ps(&output_nodes[out], in_result);
+  }
+}
+
+static INLINE void nn_propagate_input_multiple_of_8(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    bool is_output_layer, int num_outputs, float *const output_nodes) {
+  // Clamp the outputs to be non-negative (ReLU) only for hidden layers, and
+  // only when all inputs are processed in this call; otherwise clamping is
+  // deferred until the remaining inputs have been accumulated.
+  const int is_clip_required =
+      !is_output_layer && num_inputs_to_process == tot_num_inputs;
+  if (num_outputs % 8 == 0) {
+    nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+                      tot_num_inputs, num_outputs, output_nodes,
+                      is_clip_required);
+  } else if (num_outputs % 4 == 0) {
+    nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+                      tot_num_inputs, num_outputs, output_nodes,
+                      is_clip_required);
+  } else {
+    nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+                      tot_num_inputs, num_outputs, output_nodes,
+                      is_clip_required);
+  }
+}
+
+void av1_nn_predict_avx2(const float *input_nodes,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
+                         float *const output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int buf_index = 0;
+  int num_inputs = nn_config->num_inputs;
+  assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+  for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    bool is_output_layer = layer == nn_config->num_hidden_layers;
+    float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+    const int num_outputs = is_output_layer
+                                ? nn_config->num_outputs
+                                : nn_config->num_hidden_nodes[layer];
+    assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+    // When the number of inputs is a multiple of 8, use AVX2 intrinsics only.
+    if (num_inputs % 8 == 0) {
+      nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+                                       num_inputs, num_inputs, is_output_layer,
+                                       num_outputs, output_nodes);
+    } else {
+      // When the number of inputs is not a multiple of 8, use a hybrid of
+      // AVX2 and SSE3 as needed.
+      const int in_mul_8 = num_inputs / 8;
+      const int num_inputs_to_process = in_mul_8 * 8;
+      int bias_is_considered = 0;
+      if (in_mul_8) {
+        nn_propagate_input_multiple_of_8(
+            input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+            num_inputs, is_output_layer, num_outputs, output_nodes);
+        bias_is_considered = 1;
+      }
+
+      const float *out_temp = bias_is_considered ? output_nodes : layer_bias;
+      const int input_remaining = num_inputs % 8;
+      if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+        for (int out = 0; out < num_outputs; out += 8) {
+          __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+          __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+          for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+            av1_nn_propagate_4to8_sse3(&input_nodes[in],
+                                       &layer_weights[out * num_inputs + in],
+                                       &out_h, &out_l, num_inputs);
+          }
+          if (!is_output_layer) {
+            const __m128 zero = _mm_setzero_ps();
+            out_h = _mm_max_ps(out_h, zero);
+            out_l = _mm_max_ps(out_l, zero);
+          }
+          _mm_storeu_ps(&output_nodes[out + 4], out_h);
+          _mm_storeu_ps(&output_nodes[out], out_l);
+        }
+      } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+        for (int out = 0; out < num_outputs; out += 4) {
+          __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+          for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+            av1_nn_propagate_4to4_sse3(&input_nodes[in],
+                                       &layer_weights[out * num_inputs + in],
+                                       &outputs, num_inputs);
+          }
+          if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+          _mm_storeu_ps(&output_nodes[out], outputs);
+        }
+      } else if (input_remaining % 4 == 0) {
+        for (int out = 0; out < num_outputs; out++) {
+          __m128 outputs = _mm_load1_ps(&out_temp[out]);
+          for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+            av1_nn_propagate_4to1_sse3(&input_nodes[in],
+                                       &layer_weights[out * num_inputs + in],
+                                       &outputs);
+          }
+          if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+          output_nodes[out] = _mm_cvtss_f32(outputs);
+        }
+      } else {
+        // Use SSE instructions for scalar operations to avoid the latency
+        // of swapping between SIMD and FPU modes.
+        for (int out = 0; out < num_outputs; out++) {
+          __m128 outputs = _mm_load1_ps(&out_temp[out]);
+          for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+            __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+            __m128 weight =
+                _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+            outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+          }
+          if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+          output_nodes[out] = _mm_cvtss_f32(outputs);
+        }
+      }
+    }
+    // The output of the current layer becomes the input to the next layer.
+    input_nodes = output_nodes;
+    num_inputs = num_outputs;
+    buf_index = 1 - buf_index;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
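For orientation, a minimal scalar sketch (not part of the patch) of the single-layer propagation that the AVX2/SSE3 kernels above vectorize. The function name is illustrative; the weight layout (one row of num_inputs weights per output node) and the ReLU clipping on hidden layers match the code above, and layers are chained by feeding each layer's outputs back in as inputs, as av1_nn_predict_avx2() does.

  // Scalar reference for one fully connected layer (illustrative only).
  static void nn_propagate_scalar(const float *inputs, const float *weights,
                                  const float *bias, int num_inputs,
                                  int num_outputs, bool is_output_layer,
                                  float *output_nodes) {
    for (int out = 0; out < num_outputs; ++out) {
      float acc = bias[out];  // start from the bias, as the SIMD paths do
      for (int in = 0; in < num_inputs; ++in) {
        acc += inputs[in] * weights[out * num_inputs + in];
      }
      // Hidden-layer outputs are clipped at zero (ReLU); the final output
      // layer is left unclipped.
      output_nodes[out] = is_output_layer ? acc : (acc > 0.0f ? acc : 0.0f);
    }
  }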
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
index ab69088..4748a68 100644
--- a/av1/encoder/x86/ml_sse3.c
+++ b/av1/encoder/x86/ml_sse3.c
@@ -11,10 +11,10 @@
 
 #include <stdbool.h>
 #include <assert.h>
-#include <pmmintrin.h>
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
 
 // In order to avoid the high-latency of swapping between FPU and SIMD
 // operations, we keep the result in a 128-bit register even though we only
@@ -41,9 +41,9 @@
   *output = _mm_add_ps(*output, hadd2);
 }
 
-static void nn_propagate_4to1(const float *const inputs,
-                              const float *const weights,
-                              __m128 *const output) {
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const output) {
   const __m128 inputs128 = _mm_loadu_ps(inputs);
 
   const __m128 weights128 = _mm_loadu_ps(weights);
@@ -58,9 +58,9 @@
   *output = _mm_add_ps(*output, hadd2);
 }
 
-static void nn_propagate_4to4(const float *const inputs,
-                              const float *const weights, __m128 *const outputs,
-                              const int num_inputs) {
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const outputs, const int num_inputs) {
   const __m128 inputs128 = _mm_loadu_ps(inputs);
 
   __m128 hadd[2];
@@ -80,9 +80,9 @@
   *outputs = _mm_add_ps(*outputs, hh);
 }
 
-static void nn_propagate_4to8(const float *const inputs,
-                              const float *const weights, __m128 *const out_h,
-                              __m128 *const out_l, const int num_inputs) {
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+                                const float *const weights, __m128 *const out_h,
+                                __m128 *const out_l, const int num_inputs) {
   const __m128 inputs128 = _mm_loadu_ps(inputs);
 
   __m128 hadd[4];
@@ -171,9 +171,9 @@
         __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
         __m128 out_l = _mm_loadu_ps(&layer_bias[out]);
         for (int in = 0; in < num_inputs; in += 4) {
-          nn_propagate_4to8(&input_nodes[in],
-                            &layer_weights[out * num_inputs + in], &out_h,
-                            &out_l, num_inputs);
+          av1_nn_propagate_4to8_sse3(&input_nodes[in],
+                                     &layer_weights[out * num_inputs + in],
+                                     &out_h, &out_l, num_inputs);
         }
         if (!output_layer) nn_activate8(&out_h, &out_l);
         _mm_storeu_ps(&output_nodes[out + 4], out_h);
@@ -194,9 +194,9 @@
       for (int out = 0; out < num_outputs; out += 4) {
         __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
         for (int in = 0; in < num_inputs; in += 4) {
-          nn_propagate_4to4(&input_nodes[in],
-                            &layer_weights[out * num_inputs + in], &outputs,
-                            num_inputs);
+          av1_nn_propagate_4to4_sse3(&input_nodes[in],
+                                     &layer_weights[out * num_inputs + in],
+                                     &outputs, num_inputs);
         }
         if (!output_layer) nn_activate4(&outputs);
         _mm_storeu_ps(&output_nodes[out], outputs);
@@ -215,8 +215,8 @@
       for (int out = 0; out < num_outputs; out++) {
         __m128 total = _mm_load1_ps(&layer_bias[out]);
         for (int in = 0; in < num_inputs; in += 4) {
-          nn_propagate_4to1(&input_nodes[in],
-                            &layer_weights[out * num_inputs + in], &total);
+          av1_nn_propagate_4to1_sse3(
+              &input_nodes[in], &layer_weights[out * num_inputs + in], &total);
         }
         if (!output_layer) nn_activate4(&total);
         output_nodes[out] = _mm_cvtss_f32(total);
diff --git a/av1/encoder/x86/ml_sse3.h b/av1/encoder/x86/ml_sse3.h
new file mode 100644
index 0000000..f41a247
--- /dev/null
+++ b/av1/encoder/x86/ml_sse3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_
+#define AOM_AV1_ENCODER_X86_ML_SSE3_H_
+
+#include <pmmintrin.h>
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const output);
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const outputs, const int num_inputs);
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+                                const float *const weights, __m128 *const out_h,
+                                __m128 *const out_l, const int num_inputs);
+
+#endif  // AOM_AV1_ENCODER_X86_ML_SSE3_H_
diff --git a/av1/ratectrl_rtc.cc b/av1/ratectrl_rtc.cc
index a3ec6f6..62d6e74 100644
--- a/av1/ratectrl_rtc.cc
+++ b/av1/ratectrl_rtc.cc
@@ -39,6 +39,8 @@
   undershoot_pct = overshoot_pct = 50;
   max_intra_bitrate_pct = 50;
   max_inter_bitrate_pct = 0;
+  frame_drop_thresh = 0;
+  max_consec_drop = 0;
   framerate = 30.0;
   ss_number_layers = 1;
   ts_number_layers = 1;
@@ -124,7 +126,8 @@
   oxcf->pass = AOM_RC_ONE_PASS;
   oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
   oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT;
-  oxcf->rc_cfg.drop_frames_water_mark = 0;
+  oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  rc->max_consec_drop = rc_cfg.max_consec_drop;
   oxcf->tool_cfg.bit_depth = AOM_BITS_8;
   oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
   oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL;
@@ -185,9 +188,15 @@
   oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz;
   oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct;
   oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  rc->max_consec_drop = rc_cfg.max_consec_drop;
   oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
   oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
   cpi_->framerate = rc_cfg.framerate;
+  if (rc_cfg.is_screen) {
+    cpi_->oxcf.tune_cfg.content = AOM_CONTENT_SCREEN;
+    cpi_->is_screen_content_type = 1;
+  }
   cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
   cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
   set_primary_rc_buffer_sizes(oxcf, cpi_->ppi);
@@ -226,7 +235,8 @@
   return true;
 }
 
-void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
+FrameDropDecision AV1RateControlRTC::ComputeQP(
+    const AV1FrameParamsRTC &frame_params) {
   AV1_COMMON *const cm = &cpi_->common;
   int width, height;
   GF_GROUP *const gf_group = &cpi_->ppi->gf_group;
@@ -292,14 +302,25 @@
     }
   }
   av1_rc_set_frame_target(cpi_, target, cm->width, cm->height);
-
-  int bottom_index, top_index;
+  // Always drop a spatial enhancement layer whose target bandwidth is 0.
+  // Otherwise check for frame-dropping based on buffer level in
+  // av1_rc_drop_frame().
+  if ((cpi_->svc.spatial_layer_id > 0 &&
+       cpi_->oxcf.rc_cfg.target_bandwidth == 0) ||
+      av1_rc_drop_frame(cpi_)) {
+    cpi_->is_dropped_frame = true;
+    av1_rc_postencode_update_drop_frame(cpi_);
+    cpi_->frame_index_set.show_frame_count++;
+    cpi_->common.current_frame.frame_number++;
+    return FrameDropDecision::kDrop;
+  }
+  int bottom_index = 0, top_index = 0;
   cpi_->common.quant_params.base_qindex =
       av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height,
                                cpi_->gf_frame_index, &bottom_index, &top_index);
-
   if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
     av1_cyclic_refresh_setup(cpi_);
+  return FrameDropDecision::kOk;
 }
 
 int AV1RateControlRTC::GetQP() const {
@@ -327,12 +348,17 @@
   return cdef_level;
 }
 
-signed char *AV1RateControlRTC::GetCyclicRefreshMap() const {
-  return cpi_->cyclic_refresh->map;
-}
-
-int *AV1RateControlRTC::GetDeltaQ() const {
-  return cpi_->cyclic_refresh->qindex_delta;
+bool AV1RateControlRTC::GetSegmentationData(
+    AV1SegmentationData *segmentation_data) const {
+  if (cpi_->oxcf.q_cfg.aq_mode == 0) {
+    return false;
+  }
+  segmentation_data->segmentation_map = cpi_->enc_seg.map;
+  segmentation_data->segmentation_map_size =
+      cpi_->common.mi_params.mi_rows * cpi_->common.mi_params.mi_cols;
+  segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+  segmentation_data->delta_q_size = 3u;
+  return true;
 }
 
 void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
diff --git a/av1/ratectrl_rtc.h b/av1/ratectrl_rtc.h
index e96e210..1894469 100644
--- a/av1/ratectrl_rtc.h
+++ b/av1/ratectrl_rtc.h
@@ -33,7 +33,7 @@
   int width;
   int height;
   // Flag indicating if the content is screen or not.
-  bool is_screen;
+  bool is_screen = false;
   // 0-63
   int max_quantizer;
   int min_quantizer;
@@ -45,6 +45,8 @@
   int overshoot_pct;
   int max_intra_bitrate_pct;
   int max_inter_bitrate_pct;
+  int frame_drop_thresh;
+  int max_consec_drop;
   double framerate;
   int layer_target_bitrate[kAV1MaxLayers];
   int ts_rate_decimator[kAV1MaxTemporalLayers];
@@ -77,6 +79,18 @@
   int damping;
 };
 
+struct AV1SegmentationData {
+  const uint8_t *segmentation_map;
+  size_t segmentation_map_size;
+  const int *delta_q;
+  size_t delta_q_size;
+};
+
+enum class FrameDropDecision {
+  kOk,    // Frame is encoded.
+  kDrop,  // Frame is dropped.
+};
+
 class AV1RateControlRTC {
  public:
   static std::unique_ptr<AV1RateControlRTC> Create(
@@ -90,9 +104,13 @@
   AV1LoopfilterLevel GetLoopfilterLevel() const;
   // GetCdefInfo() needs to be called after ComputeQP()
   AV1CdefInfo GetCdefInfo() const;
-  signed char *GetCyclicRefreshMap() const;
-  int *GetDeltaQ() const;
-  void ComputeQP(const AV1FrameParamsRTC &frame_params);
+  // Returns the segmentation map used for cyclic refresh, based on 4x4 blocks.
+  bool GetSegmentationData(AV1SegmentationData *segmentation_data) const;
+  // ComputeQP returns kOk if the frame will be encoded; the QP is then
+  // available via GetQP(). If it returns kDrop, the frame is dropped and
+  // GetQP()/PostEncodeUpdate() must not be called
+  // (av1_rc_postencode_update_drop_frame has already been called inside
+  // ComputeQP when the drop is decided).
+  FrameDropDecision ComputeQP(const AV1FrameParamsRTC &frame_params);
   // Feedback to rate control with the size of current encoded frame
   void PostEncodeUpdate(uint64_t encoded_frame_size);
 
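A usage sketch of the updated rate-control interface (not part of the patch). It assumes the aom namespace used for these types elsewhere in the tree; encode_one_frame() is a hypothetical application callback, declared only so the sketch compiles. The key point is the new contract: when ComputeQP() returns kDrop, GetQP() and PostEncodeUpdate() are skipped.

  #include <cstdint>
  #include "av1/ratectrl_rtc.h"

  // Hypothetical application hook that encodes one frame at the given
  // qindex (0-255) and returns the encoded size in bytes.
  extern uint64_t encode_one_frame(int qp);

  static void rate_control_one_frame(aom::AV1RateControlRTC *rc,
                                     bool is_key_frame) {
    aom::AV1FrameParamsRTC frame_params;
    frame_params.spatial_layer_id = 0;
    frame_params.temporal_layer_id = 0;
    frame_params.frame_type =
        is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
    if (rc->ComputeQP(frame_params) == aom::FrameDropDecision::kDrop) {
      // Frame is dropped: do not call GetQP() or PostEncodeUpdate().
      return;
    }
    const int qp = rc->GetQP();
    rc->PostEncodeUpdate(encode_one_frame(qp));

    aom::AV1SegmentationData seg;
    if (rc->GetSegmentationData(&seg)) {
      // seg.segmentation_map holds one byte per 4x4 block
      // (mi_rows * mi_cols entries); seg.delta_q has delta_q_size entries.
    }
  }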
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 5058022..85390d5 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -29,9 +29,14 @@
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
 
-# ARM feature flags.
-set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.")
+# Arm/AArch64 feature flags.
+set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
 set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.")
+set_aom_detect_var(HAVE_NEON_DOTPROD 0
+                   "Enables Armv8.2-A Neon dotprod intrinsics optimizations.")
+set_aom_detect_var(HAVE_NEON_I8MM 0
+                   "Enables Armv8.2-A Neon i8mm intrinsics optimizations.")
+set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.")
 
 # PPC feature flags.
 set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
@@ -189,8 +194,18 @@
 set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
                    OFF)
 
-# ARM assembly/intrinsics flags.
-set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
+# Arm/AArch64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_NEON
+                   "Enables Neon optimizations on Arm/AArch64 targets." ON)
+set_aom_option_var(ENABLE_ARM_CRC32 "Enables Arm CRC32 optimizations." ON)
+set_aom_option_var(
+  ENABLE_NEON_DOTPROD
+  "Enables Armv8.2-A Neon dotprod optimizations on AArch64 targets." ON)
+set_aom_option_var(
+  ENABLE_NEON_I8MM
+  "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON)
+set_aom_option_var(ENABLE_SVE
+                   "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON)
 
 # VSX intrinsics flags.
 set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index aaef2c3..6c932e8 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -13,8 +13,6 @@
 endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
 set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1)
 
-include(FindGit)
-include(FindPerl)
 include(FindThreads)
 
 include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
@@ -214,7 +212,6 @@
 endif()
 
 if(CONFIG_ANALYZER)
-  include(FindwxWidgets)
   find_package(wxWidgets REQUIRED adv base core)
   include(${wxWidgets_USE_FILE})
 endif()
@@ -361,7 +358,7 @@
 
     # This combination has more stack overhead, so we account for it by
     # providing higher stack limit than usual.
-    add_c_flag_if_supported("-Wstack-usage=170000")
+    add_c_flag_if_supported("-Wstack-usage=285000")
     add_cxx_flag_if_supported("-Wstack-usage=270000")
   elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
     add_c_flag_if_supported("-Wstack-usage=135000")
diff --git a/build/cmake/aom_install.cmake b/build/cmake/aom_install.cmake
index b02c7b9..2c263e9 100644
--- a/build/cmake/aom_install.cmake
+++ b/build/cmake/aom_install.cmake
@@ -46,12 +46,12 @@
               -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
               -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
               -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+              -DCMAKE_THREAD_LIBS_INIT=${CMAKE_THREAD_LIBS_INIT}
               -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
               -DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF}
               -DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI}
               -DCONFIG_SALIENCY_MAP=${CONFIG_SALIENCY_MAP}
               -DCONFIG_TFLITE=${CONFIG_TFLITE}
-              -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
               -P
               "${AOM_ROOT}/build/cmake/pkg_config.cmake"
       COMMENT "Writing aom.pc"
diff --git a/build/cmake/aom_optimization.cmake b/build/cmake/aom_optimization.cmake
index 6b0c55a..0f93228 100644
--- a/build/cmake/aom_optimization.cmake
+++ b/build/cmake/aom_optimization.cmake
@@ -270,7 +270,7 @@
             --arch=${AOM_TARGET_CPU}
             --sym=${symbol} ${AOM_RTCD_FLAGS}
             --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output}
-    DEPENDS ${config}
+    DEPENDS "${AOM_ROOT}/build/cmake/rtcd.pl" ${config}
     COMMENT "Generating ${output}"
     WORKING_DIRECTORY ${AOM_CONFIG_DIR}
     VERBATIM)
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index 799a313..a9b7a67 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -9,11 +9,60 @@
 # can obtain it at www.aomedia.org/license/patent.
 #
 
-if("${AOM_TARGET_CPU}" MATCHES "^arm")
+if("${AOM_TARGET_CPU}" STREQUAL "arm64")
   set(AOM_ARCH_ARM 1)
-  if("${AOM_TARGET_CPU}" STREQUAL "arm64")
-    set(AOM_ARCH_AARCH64 1)
+  set(AOM_ARCH_AARCH64 1)
+  set(RTCD_ARCH_ARM "yes")
+
+  set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE")
+  set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc")
+  set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod")
+  set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm")
+  set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
+
+  # Check that the compiler flag to enable each flavor is supported by the
+  # compiler. This may not be the case for new architecture features on old
+  # compiler versions.
+  foreach(flavor ${ARM64_FLAVORS})
+    if(ENABLE_${flavor} AND NOT DEFINED AOM_${flavor}_FLAG)
+      set(AOM_${flavor}_FLAG "${AOM_${flavor}_DEFAULT_FLAG}")
+      unset(FLAG_SUPPORTED)
+      check_c_compiler_flag("${AOM_${flavor}_FLAG}" FLAG_SUPPORTED)
+      if(NOT ${FLAG_SUPPORTED})
+        set(ENABLE_${flavor} 0)
+      endif()
+    endif()
+  endforeach()
+
+  # SVE requires that the Neon-SVE bridge header is also available.
+  if(ENABLE_SVE)
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}")
+    aom_check_source_compiles("arm_neon_sve_bridge_available" "
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+    if(HAVE_SVE_HEADERS EQUAL 0)
+      set(ENABLE_SVE 0)
+    endif()
   endif()
+
+  foreach(flavor ${ARM64_FLAVORS})
+    if(ENABLE_${flavor})
+      set(HAVE_${flavor} 1)
+      set(RTCD_HAVE_${flavor} "yes")
+    else()
+      set(HAVE_${flavor} 0)
+      string(TOLOWER ${flavor} flavor)
+      set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+    endif()
+  endforeach()
+
+elseif("${AOM_TARGET_CPU}" MATCHES "^arm")
+  set(AOM_ARCH_ARM 1)
   set(RTCD_ARCH_ARM "yes")
 
   if(ENABLE_NEON)
@@ -24,18 +73,6 @@
     set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
   endif()
 
-  check_c_source_compiles("
-    #if !defined(__ARM_FEATURE_CRC32) || __ARM_FEATURE_CRC32 != 1
-    #error \"CRC32 is unavailable.\"
-    #endif
-    int main(void) { return 0; }" HAVE_CRC32)
-  if(HAVE_CRC32)
-    set(HAVE_ARM_CRC32 1)
-  else()
-    set(HAVE_ARM_CRC32 0)
-    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-arm_crc32)
-  endif()
-
 elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
   set(AOM_ARCH_PPC 1)
   set(RTCD_ARCH_PPC "yes")
diff --git a/build/cmake/pkg_config.cmake b/build/cmake/pkg_config.cmake
index e8fff2e..c4f9480 100644
--- a/build/cmake/pkg_config.cmake
+++ b/build/cmake/pkg_config.cmake
@@ -13,7 +13,7 @@
 set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
                   "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR"
                   "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME"
-                  "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
+                  "CONFIG_MULTITHREAD")
 
 foreach(arg ${REQUIRED_ARGS})
   if("${${arg}}" STREQUAL "")
@@ -60,8 +60,9 @@
 endif()
 file(APPEND "${pkgconfig_file}" "\nConflicts:\n")
 file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
-if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
-  file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
+if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT)
+  file(APPEND "${pkgconfig_file}"
+       "Libs.private: -lm ${CMAKE_THREAD_LIBS_INIT}\n")
 else()
   file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
 endif()
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl
index bd3b9d5..1cf52f0 100755
--- a/build/cmake/rtcd.pl
+++ b/build/cmake/rtcd.pl
@@ -392,8 +392,9 @@
   @ALL_ARCHS = filter(qw/neon/);
   arm;
 } elsif ($opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon arm_crc32/);
-  &require(@ALL_ARCHS);
+  @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve/);
+  @REQUIRES = filter(qw/neon/);
+  &require(@REQUIRES);
   arm;
 } elsif ($opts{arch} eq 'ppc') {
   @ALL_ARCHS = filter(qw/vsx/);
diff --git a/build/cmake/toolchains/android.cmake b/build/cmake/toolchains/android.cmake
index 4d38c9a..fb08685 100644
--- a/build/cmake/toolchains/android.cmake
+++ b/build/cmake/toolchains/android.cmake
@@ -46,8 +46,6 @@
 
 if(ANDROID_ABI MATCHES "^arm")
   set(CMAKE_ASM_COMPILER as)
-  # No runtime cpu detect for arm targets.
-  set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
 elseif(ANDROID_ABI MATCHES "^x86")
   set(CMAKE_ASM_NASM_COMPILER yasm)
 endif()
diff --git a/build/cmake/toolchains/arm-ios-common.cmake b/build/cmake/toolchains/arm-ios-common.cmake
index 62ca115..2c433be 100644
--- a/build/cmake/toolchains/arm-ios-common.cmake
+++ b/build/cmake/toolchains/arm-ios-common.cmake
@@ -21,7 +21,4 @@
 set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
-# No runtime cpu detect for arm*-ios targets.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
-
 # TODO(tomfinegan): Handle bit code embedding.
diff --git a/build/cmake/toolchains/arm64-linux-clang.cmake b/build/cmake/toolchains/arm64-linux-clang.cmake
new file mode 100644
index 0000000..b4645cc
--- /dev/null
+++ b/build/cmake/toolchains/arm64-linux-clang.cmake
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+set(TRIPLE aarch64-linux-gnu)
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_ASM_COMPILER clang)
+set(CMAKE_ASM_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
diff --git a/build/cmake/toolchains/arm64-linux-gcc.cmake b/build/cmake/toolchains/arm64-linux-gcc.cmake
index 133a96a..3d0dff0 100644
--- a/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -38,6 +38,3 @@
 
 # No intrinsics flag required for arm64-linux-gcc.
 set(AOM_NEON_INTRIN_FLAG "")
-
-# No runtime cpu detect for arm64-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/arm64-mingw-gcc.cmake b/build/cmake/toolchains/arm64-mingw-gcc.cmake
index 7400423..95b26d3 100644
--- a/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -34,6 +34,3 @@
 if(NOT CMAKE_RANLIB)
   set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 endif()
-
-# No runtime cpu detect for arm64-mingw-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/armv7-linux-gcc.cmake b/build/cmake/toolchains/armv7-linux-gcc.cmake
index 366e198..aa05505 100644
--- a/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -44,6 +44,3 @@
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 
 set(AOM_NEON_INTRIN_FLAG "-mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-
-# No runtime cpu detect for armv7-linux-gcc.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/build/cmake/toolchains/i686-linux-gcc.cmake b/build/cmake/toolchains/i686-linux-gcc.cmake
new file mode 100644
index 0000000..c4f6ab9
--- /dev/null
+++ b/build/cmake/toolchains/i686-linux-gcc.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+
+if("${CROSS}" STREQUAL "")
+
+  # Default the cross compiler prefix to one used by Debian and other package
+  # management systems.
+  set(CROSS i686-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+  set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+  set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
diff --git a/common/tools_common.c b/common/tools_common.c
index afe4619..4d77a1b 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -65,8 +65,8 @@
 void die_codec(aom_codec_ctx_t *ctx, const char *s) {
   const char *detail = aom_codec_error_detail(ctx);
 
-  printf("%s: %s\n", s, aom_codec_error(ctx));
-  if (detail) printf("    %s\n", detail);
+  fprintf(stderr, "%s: %s\n", s, aom_codec_error(ctx));
+  if (detail) fprintf(stderr, "    %s\n", detail);
   exit(EXIT_FAILURE);
 }
 
diff --git a/docs.cmake b/docs.cmake
index 0d8db92..0d7b4cf 100644
--- a/docs.cmake
+++ b/docs.cmake
@@ -223,7 +223,7 @@
   list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources)
   list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs)
   if(NOT ${num_sources} EQUAL ${num_descs})
-    message(FATAL_ERROR "Unqeual example and description totals.")
+    message(FATAL_ERROR "Unequal example and description totals.")
   endif()
 
   # Take the list of examples and produce example_basename.dox for each file in
diff --git a/examples/lightfield_tile_list_decoder.c b/examples/lightfield_tile_list_decoder.c
index 5b15ae0..d71ff5b 100644
--- a/examples/lightfield_tile_list_decoder.c
+++ b/examples/lightfield_tile_list_decoder.c
@@ -170,7 +170,7 @@
         if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
                                        frame_res[0], frame_res[1], 32, 8,
                                        border)) {
-          die("Failed to allocate references.");
+          fatal("Failed to allocate references.");
         }
       }
     }
diff --git a/examples/resize_util.c b/examples/resize_util.c
deleted file mode 100644
index 45a1db2..0000000
--- a/examples/resize_util.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "av1/common/resize.h"
-#include "common/tools_common.h"
-
-static const char *exec_name = NULL;
-
-static void usage() {
-  printf("Usage:\n");
-  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
-         exec_name);
-  printf("<output_yuv> [<frames>]\n");
-}
-
-void usage_exit(void) {
-  usage();
-  exit(EXIT_FAILURE);
-}
-
-static int parse_dim(char *v, int *width, int *height) {
-  char *x = strchr(v, 'x');
-  if (x == NULL) x = strchr(v, 'X');
-  if (x == NULL) return 0;
-  *width = atoi(v);
-  *height = atoi(&x[1]);
-  if (*width <= 0 || *height <= 0)
-    return 0;
-  else
-    return 1;
-}
-
-int main(int argc, char *argv[]) {
-  char *fin, *fout;
-  FILE *fpin, *fpout;
-  uint8_t *inbuf, *outbuf;
-  uint8_t *inbuf_u, *outbuf_u;
-  uint8_t *inbuf_v, *outbuf_v;
-  int f, frames;
-  int width, height, target_width, target_height;
-  int failed = 0;
-
-  exec_name = argv[0];
-
-  if (argc < 5) {
-    printf("Incorrect parameters:\n");
-    usage();
-    return 1;
-  }
-
-  fin = argv[1];
-  fout = argv[4];
-  if (!parse_dim(argv[2], &width, &height)) {
-    printf("Incorrect parameters: %s\n", argv[2]);
-    usage();
-    return 1;
-  }
-  if (!parse_dim(argv[3], &target_width, &target_height)) {
-    printf("Incorrect parameters: %s\n", argv[3]);
-    usage();
-    return 1;
-  }
-
-  fpin = fopen(fin, "rb");
-  if (fpin == NULL) {
-    printf("Can't open file %s to read\n", fin);
-    usage();
-    return 1;
-  }
-  fpout = fopen(fout, "wb");
-  if (fpout == NULL) {
-    fclose(fpin);
-    printf("Can't open file %s to write\n", fout);
-    usage();
-    return 1;
-  }
-  if (argc >= 6)
-    frames = atoi(argv[5]);
-  else
-    frames = INT_MAX;
-
-  printf("Input size:  %dx%d\n", width, height);
-  printf("Target size: %dx%d, Frames: ", target_width, target_height);
-  if (frames == INT_MAX)
-    printf("All\n");
-  else
-    printf("%d\n", frames);
-
-  inbuf = (uint8_t *)malloc(width * height * 3 / 2);
-  outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
-  if (!(inbuf && outbuf)) {
-    printf("Failed to allocate buffers.\n");
-    failed = 1;
-    goto Error;
-  }
-  inbuf_u = inbuf + width * height;
-  inbuf_v = inbuf_u + width * height / 4;
-  outbuf_u = outbuf + target_width * target_height;
-  outbuf_v = outbuf_u + target_width * target_height / 4;
-  f = 0;
-  while (f < frames) {
-    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
-    av1_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
-                        width, outbuf, target_width, outbuf_u, outbuf_v,
-                        target_width / 2, target_height, target_width);
-    fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
-    f++;
-  }
-  printf("%d frames processed\n", f);
-Error:
-  fclose(fpin);
-  fclose(fpout);
-
-  free(inbuf);
-  free(outbuf);
-  return failed;
-}
diff --git a/examples/svc_encoder_rtc.cc b/examples/svc_encoder_rtc.cc
index 1730f89..c37df79 100644
--- a/examples/svc_encoder_rtc.cc
+++ b/examples/svc_encoder_rtc.cc
@@ -18,6 +18,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <memory>
+
 #include "config/aom_config.h"
 
 #if CONFIG_AV1_DECODER
@@ -30,6 +32,7 @@
 #include "common/video_writer.h"
 #include "examples/encoder_util.h"
 #include "aom_ports/aom_timer.h"
+#include "av1/ratectrl_rtc.h"
 
 #define OPTION_BUFFER_SIZE 1024
 
@@ -44,6 +47,7 @@
   int decode;
   int tune_content;
   int show_psnr;
+  bool use_external_rc;
 } AppInput;
 
 typedef enum {
@@ -99,6 +103,8 @@
             "Attempt to test decoding the output when set to 1. Default is 1.");
 static const arg_def_t psnr_arg =
     ARG_DEF(NULL, "psnr", -1, "Show PSNR in status line.");
+static const arg_def_t ext_rc_arg =
+    ARG_DEF(NULL, "use-ext-rc", 0, "Use external rate control.");
 static const struct arg_enum_list tune_content_enum[] = {
   { "default", AOM_CONTENT_DEFAULT },
   { "screen", AOM_CONTENT_SCREEN },
@@ -372,6 +378,8 @@
       printf("tune content %d\n", app_input->tune_content);
     } else if (arg_match(&arg, &psnr_arg, argi)) {
       app_input->show_psnr = 1;
+    } else if (arg_match(&arg, &ext_rc_arg, argi)) {
+      app_input->use_external_rc = true;
     } else {
       ++argj;
     }
@@ -429,10 +437,12 @@
       enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
 }
 
-static int mode_to_num_temporal_layers[11] = {
-  1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3
+static int mode_to_num_temporal_layers[12] = {
+  1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3,
 };
-static int mode_to_num_spatial_layers[11] = { 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3 };
+static int mode_to_num_spatial_layers[12] = {
+  1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 3,
+};
 
 // For rate control encoding stats.
 struct RateControlMetrics {
@@ -607,6 +617,7 @@
   int i;
   int enable_longterm_temporal_ref = 1;
   int shift = (layering_mode == 8) ? 2 : 0;
+  int simulcast_mode = (layering_mode == 11);
   *use_svc_control = 1;
   layer_id->spatial_layer_id = spatial_layer_id;
   int lag_index = 0;
@@ -1102,7 +1113,173 @@
           ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4;
         }
       }
-      if (layer_id->spatial_layer_id > 0) {
+      break;
+    case 11:
+      // Simulcast mode for 3 spatial and 3 temporal layers.
+      // No inter-layer prediction; the only prediction is temporal, with a
+      // single reference (LAST).
+      // No overlap in buffer slots between spatial layers. So for example,
+      // SL0 only uses slots 0 and 1.
+      // SL1 only uses slots 2 and 3.
+      // SL2 only uses slots 4 and 5.
+      // All 7 references for each inter-frame must only access buffer slots
+      // for that spatial layer.
+      // On key (super)frames: SL1 and SL2 must have no references set
+      // and must refresh all the slots for that layer only (so 2 and 3
+      // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+      // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+      // internally as Intra-only frames that allow that stream to be decoded.
+      // These conditions allow each spatial stream to be independently
+      // decodable.
+
+      // Initialize all references to 0 (don't use reference).
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+        ref_frame_config->reference[i] = 0;
+      // Initialize as no refresh/update for all slots.
+      for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+        ref_frame_config->ref_idx[i] = 0;
+
+      if (is_key_frame) {
+        if (layer_id->spatial_layer_id == 0) {
+          // Assign LAST/GOLDEN to slot 0/1.
+          // Refresh slots 0 and 1 for SL0.
+          // SL0: this will get set to KEY frame internally.
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 1;
+          ref_frame_config->refresh[0] = 1;
+          ref_frame_config->refresh[1] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Assign LAST/GOLDEN to slot 2/3.
+          // Refresh slots 2 and 3 for SL1.
+          // This will get set to Intra-only frame internally.
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+          ref_frame_config->refresh[2] = 1;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {
+          // Assign LAST/GOLDEN to slot 4/5.
+          // Refresh slots 4 and 5 for SL2.
+          // This will get set to Intra-only frame internally.
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5;
+          ref_frame_config->refresh[4] = 1;
+          ref_frame_config->refresh[5] = 1;
+        }
+      } else if (superframe_cnt % 4 == 0) {
+        // Base temporal layer: TL0
+        layer_id->temporal_layer_id = 0;
+        if (layer_id->spatial_layer_id == 0) {  // SL0
+          // Reference LAST. Assign all references to either slot
+          // 0 or 1. Here we assign LAST to slot 0, all others to 1.
+          // Update slot 0 (LAST).
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 1;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+          ref_frame_config->refresh[0] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {  // SL1
+          // Reference LAST. Assign all references to either slot
+          // 2 or 3. Here we assign LAST to slot 2, all others to 3.
+          // Update slot 2 (LAST).
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+          ref_frame_config->refresh[2] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {  // SL2
+          // Reference LAST. Assign all references to either slot
+          // 4 or 5. Here we assign LAST to slot 4, all others to 5.
+          // Update slot 4 (LAST).
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 5;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+          ref_frame_config->refresh[4] = 1;
+        }
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        // First top temporal enhancement layer: TL2
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {  // SL0
+          // Reference LAST (slot 0). Assign other references to slot 1.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 1;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+        } else if (layer_id->spatial_layer_id == 1) {  // SL1
+          // Reference LAST (slot 2). Assign other references to slot 3.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+        } else if (layer_id->spatial_layer_id == 2) {  // SL2
+          // Reference LAST (slot 4). Assign other references to slot 5.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 5;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+        }
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        // Middle temporal enhancement layer: TL1
+        layer_id->temporal_layer_id = 1;
+        if (layer_id->spatial_layer_id == 0) {  // SL0
+          // Reference LAST (slot 0).
+          // Set GOLDEN to slot 1 and update slot 1.
+          // This will be used as reference for next TL2.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 1;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+          ref_frame_config->refresh[1] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {  // SL1
+          // Reference LAST (slot 2).
+          // Set GOLDEN to slot 3 and update slot 3.
+          // This will be used as reference for next TL2.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 2) {  // SL2
+          // Reference LAST (slot 4).
+          // Set GOLDEN to slot 5 and update slot 5.
+          // This will be used as reference for next TL2.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 5;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+          ref_frame_config->refresh[5] = 1;
+        }
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        // Second top temporal enhancement layer: TL2
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {  // SL0
+          // Reference LAST (slot 1). Assign other references to slot 0.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {  // SL1
+          // Reference LAST (slot 3). Assign other references to slot 2.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 2;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 3;
+        } else if (layer_id->spatial_layer_id == 2) {  // SL2
+          // Reference LAST (slot 5). Assign other references to slot 4.
+          // No update/refresh on any slots.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5;
+        }
+      }
+      if (!simulcast_mode && layer_id->spatial_layer_id > 0) {
         // Always reference GOLDEN (inter-layer prediction).
         ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
         if (ksvc_mode) {
@@ -1120,8 +1297,8 @@
       // allow for top spatial layer to use additional temporal reference.
       // Additional reference is only updated on base temporal layer, every
       // 10 TL0 frames here.
-      if (enable_longterm_temporal_ref && layer_id->spatial_layer_id == 2 &&
-          layering_mode == 8) {
+      if (!simulcast_mode && enable_longterm_temporal_ref &&
+          layer_id->spatial_layer_id == 2 && layering_mode == 8) {
         ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
         if (!is_key_frame) ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
         if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
@@ -1220,6 +1397,51 @@
   fprintf(stderr, "\n");
 }
 
+static aom::AV1RateControlRtcConfig create_rtc_rc_config(
+    const aom_codec_enc_cfg_t &cfg, const AppInput &app_input) {
+  aom::AV1RateControlRtcConfig rc_cfg;
+  rc_cfg.width = cfg.g_w;
+  rc_cfg.height = cfg.g_h;
+  rc_cfg.max_quantizer = cfg.rc_max_quantizer;
+  rc_cfg.min_quantizer = cfg.rc_min_quantizer;
+  rc_cfg.target_bandwidth = cfg.rc_target_bitrate;
+  rc_cfg.buf_initial_sz = cfg.rc_buf_initial_sz;
+  rc_cfg.buf_optimal_sz = cfg.rc_buf_optimal_sz;
+  rc_cfg.buf_sz = cfg.rc_buf_sz;
+  rc_cfg.overshoot_pct = cfg.rc_overshoot_pct;
+  rc_cfg.undershoot_pct = cfg.rc_undershoot_pct;
+  // Hardcoded to the same value passed to AOME_SET_MAX_INTRA_BITRATE_PCT.
+  rc_cfg.max_intra_bitrate_pct = 300;
+  rc_cfg.framerate = cfg.g_timebase.den;
+  // TODO(jianj): Add support for SVC.
+  rc_cfg.ss_number_layers = 1;
+  rc_cfg.ts_number_layers = 1;
+  rc_cfg.scaling_factor_num[0] = 1;
+  rc_cfg.scaling_factor_den[0] = 1;
+  rc_cfg.layer_target_bitrate[0] = static_cast<int>(rc_cfg.target_bandwidth);
+  rc_cfg.max_quantizers[0] = rc_cfg.max_quantizer;
+  rc_cfg.min_quantizers[0] = rc_cfg.min_quantizer;
+  rc_cfg.aq_mode = app_input.aq_mode;
+
+  return rc_cfg;
+}
+
+static int qindex_to_quantizer(int qindex) {
+  // Table that converts the 0-63 quantizer range used externally to the
+  // 0-255 qindex range used internally.
+  static const int quantizer_to_qindex[] = {
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,  40,  44,  48,
+    52,  56,  60,  64,  68,  72,  76,  80,  84,  88,  92,  96,  100,
+    104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+    156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+    208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+  };
+  for (int quantizer = 0; quantizer < 64; ++quantizer)
+    if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+  return 63;
+}
+
 int main(int argc, const char **argv) {
   AppInput app_input;
   AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
@@ -1447,6 +1669,12 @@
     aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0);
   }
 
+  if (app_input.use_external_rc) {
+    aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1);
+  }
+
+  aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX);
+
   svc_params.number_spatial_layers = ss_number_layers;
   svc_params.number_temporal_layers = ts_number_layers;
   for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
@@ -1483,6 +1711,13 @@
     frame_cnt_layer[lx] = 0;
   }
 
+  std::unique_ptr<aom::AV1RateControlRTC> rc_api;
+  if (app_input.use_external_rc) {
+    const aom::AV1RateControlRtcConfig rc_cfg =
+        create_rtc_rc_config(cfg, app_input);
+    rc_api = aom::AV1RateControlRTC::Create(rc_cfg);
+  }
+
   frame_avail = 1;
   struct psnr_stats psnr_stream;
   memset(&psnr_stream, 0, sizeof(psnr_stream));
@@ -1621,6 +1856,21 @@
           die_codec(&codec, "Failed to SET_BITRATE_ONE_PASS_CBR");
       }
 
+      if (rc_api) {
+        aom::AV1FrameParamsRTC frame_params;
+        // TODO(jianj): Add support for SVC.
+        frame_params.spatial_layer_id = 0;
+        frame_params.temporal_layer_id = 0;
+        frame_params.frame_type =
+            is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
+        rc_api->ComputeQP(frame_params);
+        const int current_qp = rc_api->GetQP();
+        if (aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS,
+                              qindex_to_quantizer(current_qp))) {
+          die_codec(&codec, "Failed to SET_QUANTIZER_ONE_PASS");
+        }
+      }
+
       // Do the layer encode.
       aom_usec_timer_start(&timer);
       if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
@@ -1631,10 +1881,14 @@
       frame_cnt_layer[layer] += 1;
 
       got_data = 0;
+      // For simulcast (mode 11): write each spatial layer only to its own file.
+      int ss_layers_write = (app_input.layering_mode == 11)
+                                ? layer_id.spatial_layer_id + 1
+                                : ss_number_layers;
       while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
         switch (pkt->kind) {
           case AOM_CODEC_CX_FRAME_PKT:
-            for (int sl = layer_id.spatial_layer_id; sl < ss_number_layers;
+            for (int sl = layer_id.spatial_layer_id; sl < ss_layers_write;
                  ++sl) {
               for (int tl = layer_id.temporal_layer_id; tl < ts_number_layers;
                    ++tl) {
@@ -1675,6 +1929,9 @@
               if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
             }
 
+            if (rc_api) {
+              rc_api->PostEncodeUpdate(pkt->data.frame.sz);
+            }
             // Update for short-time encoding bitrate states, for moving window
             // of size rc->window, shifted by rc->window / 2.
             // Ignore first window segment, due to key frame.
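To summarize the layering_mode == 11 (simulcast) reference pattern added above, the per-spatial-layer buffer-slot usage can be tabulated as below. This is a sketch derived from the case 11 logic, not code from the patch; the struct and function names are illustrative, and key superframes (which refresh both of a layer's slots) are not covered.

  // Slot usage for simulcast mode: spatial layer k owns slots 2k and 2k+1.
  struct SimulcastSlots {
    int last;     // slot referenced as LAST this frame
    int refresh;  // slot refreshed this frame, or -1 for no refresh
  };

  // phase = superframe_cnt % 4, giving the TL0, TL2, TL1, TL2 pattern.
  static SimulcastSlots simulcast_slots(int spatial_layer_id, int phase) {
    const int base = 2 * spatial_layer_id;
    switch (phase) {
      case 0: return { base, base };      // TL0: ref LAST, refresh LAST slot
      case 1: return { base, -1 };        // TL2: ref LAST, no refresh
      case 2: return { base, base + 1 };  // TL1: ref LAST, refresh GOLDEN slot
      default: return { base + 1, -1 };   // TL2: ref GOLDEN slot, no refresh
    }
  }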
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index a9f7f85..979ee6b 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -27,15 +27,15 @@
   static const int kHeight = 144;
 
   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
diff --git a/test/allintra_end_to_end_test.cc b/test/allintra_end_to_end_test.cc
index 98a7973..8ec24aa 100644
--- a/test/allintra_end_to_end_test.cc
+++ b/test/allintra_end_to_end_test.cc
@@ -56,25 +56,25 @@
         deltaq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
         tile_columns_(GET_PARAM(5)), enable_tx_size_search_(GET_PARAM(6)) {}
 
-  virtual ~AllIntraEndToEndTest() {}
+  ~AllIntraEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kAllIntra);
     cfg_.g_threads = threads_;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_ROW_MT, 1);
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 002a206..081123c 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -58,9 +58,9 @@
         rc_end_usage_(GET_PARAM(2)) {
     is_arf_frame_present_ = 0;
   }
-  virtual ~AltRefFramePresenceTestLarge() {}
+  ~AltRefFramePresenceTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(altref_test_params_.encoding_mode);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -71,10 +71,10 @@
     cfg_.g_lag_in_frames = altref_test_params_.lag_in_frames;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -85,8 +85,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (is_arf_frame_present_ != 1 && AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -149,9 +149,9 @@
     limit_ = 60;
     frame_num_ = 0;
   }
-  virtual ~GoldenFrameIntervalTestLarge() {}
+  ~GoldenFrameIntervalTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(gf_interval_param_.encoding_mode);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -166,10 +166,10 @@
     cfg_.rc_target_bitrate = 1000;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -189,7 +189,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     (void)pkt;
     ++frame_num_;
   }
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index b4a8b61..674a883 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -32,16 +32,16 @@
       public ::libaom_test::EncoderTest {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc
index 63ccdfc..f51444d 100644
--- a/test/arf_freq_test.cc
+++ b/test/arf_freq_test.cc
@@ -80,9 +80,9 @@
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
 
-  virtual ~ArfFreqTestLarge() {}
+  ~ArfFreqTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(test_encode_param_.mode);
     if (test_encode_param_.mode != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 25;
@@ -93,7 +93,7 @@
     }
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     min_run_ = ARF_NOT_SEEN;
     run_of_visible_frames_ = 0;
   }
@@ -115,7 +115,7 @@
     return frames;
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
     const int frames = GetNumFramesInPkt(pkt);
     if (frames == 1) {
@@ -134,8 +134,8 @@
     }
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
diff --git a/test/av1_c_vs_simd_encode.sh b/test/av1_c_vs_simd_encode.sh
new file mode 100755
index 0000000..29a2f99
--- /dev/null
+++ b/test/av1_c_vs_simd_encode.sh
@@ -0,0 +1,559 @@
+#!/bin/sh
+## Copyright (c) 2023, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+##  This script checks the bit exactness between the C and SIMD
+##  implementations of the AV1 encoder.
+##
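+##  Illustrative invocation (the data path below is a placeholder, not part of
+##  this script):
+##    LIBAOM_TEST_DATA_PATH=/path/to/libaom-test-data \
+##      sh test/av1_c_vs_simd_encode.sh
+##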
+. $(dirname $0)/tools_common.sh
+
+PRESETS="good rt"
+LOWBD_CIF_CLIP="yuv_raw_input"
+LOWBD_480p_CLIP="yuv_480p_raw_input"
+LOWBD_720p_CLIP="y4m_720p_input"
+HIGHBD_CLIP="y4m_360p_10bit_input"
+SC_CLIP="y4m_screen_input"
+OUT_FILE_SUFFIX=".ivf"
+SCRIPT_DIR=$(dirname "$0")
+LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd)
+
+# Clips used in test.
+YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_480P_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_640_480_30.yuv"
+Y4M_360P_10BIT_INPUT="${LIBAOM_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m"
+Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+Y4M_SCREEN_INPUT="${LIBAOM_TEST_DATA_PATH}/wikipedia_420_360p_60f.y4m"
+
+# Number of frames to test.
+AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=35
+
+# Create a temporary directory for output files.
+if [ -n "${TMPDIR}" ]; then
+  AOM_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+  AOM_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+  AOM_TEST_TEMP_ROOT=/tmp
+fi
+
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_TEMP_ROOT}/av1_test_$$"
+
+if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
+   [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+  echo "${0##*/}: Cannot create output directory, giving up."
+  echo "${0##*/}:   AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}"
+  exit 1
+fi
+
+elog() {
+  echo "$@" 1>&2
+}
+
+# Echoes the path of the aomenc binary built for target $1 and preset $2 when
+# it exists and is executable under ${AOM_TEST_OUTPUT_DIR}, or an empty string
+# otherwise. The caller is responsible for checking the returned string.
+av1_enc_tool_path() {
+  local target="$1"
+  local preset="$2"
+  local tool_path="${AOM_TEST_OUTPUT_DIR}/build_target_${target}/aomenc_${preset}"
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
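+# Illustrative call (the target and preset names here are examples only):
+#   encoder="$(av1_enc_tool_path x86_64-linux good)"
+#   [ -n "${encoder}" ] || elog "aomenc_good not found"
+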
+# Environment check: Make sure input and source directories are available.
+av1_c_vs_simd_enc_verify_environment () {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then
+    elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then
+    elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_720P_INPUT}" ]; then
+    elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_SCREEN_INPUT}" ]; then
+    elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -d "$LIBAOM_SOURCE_DIR" ]; then
+    elog "LIBAOM_SOURCE_DIR does not exist."
+    return 1
+  fi
+}
+
+# This is not needed since tools_common.sh performs the same cleanup.
+# The code is kept here for reference.
+# cleanup() {
+#  rm -rf  ${AOM_TEST_OUTPUT_DIR}
+# }
+
+# Echo the AOM_SIMD_CAPS_MASK value for each instruction set architecture.
+avx512f() {
+   echo "0x1FF"
+}
+
+avx2() {
+   echo "0x0FF"
+}
+
+avx() {
+   echo "0x07F"
+}
+
+sse4_1() {
+   echo "0x03F"
+}
+
+ssse3() {
+   echo "0x01F"
+}
+
+sse3() {
+   echo "0x00F"
+}
+
+sse2() {
+   echo "0x007"
+}
+
+get_bitrates() {
+  local content=$1
+  local preset=$2
+
+  # Bit-rates:
+  local bitrate_lowres_good="300"
+  local bitrate_480p_good="500"
+  local bitrate_720p_good="1000"
+  local bitrate_scc_360p_good="500"
+  local bitrate_lowres_rt="200"
+  local bitrate_480p_rt="300"
+  local bitrate_720p_rt="600"
+  local bitrate_scc_360p_rt="300"
+  local bitrate_hbd_360p="500"
+
+  if [ "${preset}" = "good" ]; then
+    if [ "${content}" = "yuv_raw_input" ]; then
+      echo "${bitrate_lowres_good}"
+    elif [ "${content}" = "yuv_480p_raw_input" ]; then
+      echo "${bitrate_480p_good}"
+    elif [ "${content}" = "y4m_720p_input" ]; then
+      echo "${bitrate_720p_good}"
+    elif [ "${content}" = "y4m_screen_input" ]; then
+      echo "${bitrate_scc_360p_good}"
+    elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+      echo "${bitrate_hbd_360p}"
+    else
+      elog "Invalid content"
+    fi
+  elif  [ "${preset}" = "rt" ]; then
+    if [ "${content}" = "yuv_raw_input" ]; then
+      echo "${bitrate_lowres_rt}"
+    elif [ "${content}" = "yuv_480p_raw_input" ]; then
+      echo "${bitrate_480p_rt}"
+    elif [ "${content}" = "y4m_720p_input" ]; then
+      echo "${bitrate_720p_rt}"
+    elif [ "${content}" = "y4m_screen_input" ]; then
+      echo "${bitrate_scc_360p_rt}"
+    elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+      echo "${bitrate_hbd_360p}"
+    else
+      elog "Invalid content"
+    fi
+  else
+    elog "invalid preset"
+  fi
+}
+
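+# For reference, with the values above, "get_bitrates yuv_raw_input good"
+# echoes 300 and "get_bitrates y4m_720p_input rt" echoes 600.
+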
+# Echo clip details to be used as input to aomenc.
+yuv_raw_input() {
+  echo ""${YUV_RAW_INPUT}"
+       --width=352
+       --height=288
+       --bit-depth=8"
+}
+
+y4m_360p_10bit_input() {
+  echo ""${Y4M_360P_10BIT_INPUT}"
+       --bit-depth=10"
+}
+
+yuv_480p_raw_input() {
+  echo ""${YUV_480P_RAW_INPUT}"
+       --width=640
+       --height=480
+       --bit-depth=8"
+}
+
+y4m_720p_input() {
+  echo ""${Y4M_720P_INPUT}"
+       --bit-depth=8"
+}
+
+y4m_screen_input() {
+  echo ""${Y4M_SCREEN_INPUT}"
+       --tune-content=screen
+       --enable-palette=1
+       --bit-depth=8"
+}
+
+has_x86_isa_extn() {
+  instruction_set=$1
+  if ! grep -q "$instruction_set" /proc/cpuinfo; then
+    # This instruction set is not supported.
+    return 1
+  fi
+}
+
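+# Illustrative check, mirroring how has_x86_isa_extn is used further below:
+#   if ! has_x86_isa_extn avx2; then echo "avx2 is not supported"; fi
+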
+# Echo good encode params for use with AV1 encoder.
+av1_encode_good_params() {
+  echo "--good \
+  --ivf \
+  --profile=0 \
+  --static-thresh=0 \
+  --threads=1 \
+  --tile-columns=0 \
+  --tile-rows=0 \
+  --verbose \
+  --end-usage=vbr \
+  --kf-max-dist=160 \
+  --kf-min-dist=0 \
+  --max-q=63 \
+  --min-q=0 \
+  --overshoot-pct=100 \
+  --undershoot-pct=100 \
+  --passes=2 \
+  --arnr-maxframes=7 \
+  --arnr-strength=5 \
+  --auto-alt-ref=1 \
+  --drop-frame=0 \
+  --frame-parallel=0 \
+  --lag-in-frames=35 \
+  --maxsection-pct=2000 \
+  --minsection-pct=0 \
+  --sharpness=0"
+}
+
+# Echo realtime encode params for use with AV1 encoder.
+av1_encode_rt_params() {
+  echo "--rt \
+  --ivf \
+  --profile=0 \
+  --static-thresh=0 \
+  --threads=1 \
+  --tile-columns=0 \
+  --tile-rows=0 \
+  --verbose \
+  --end-usage=cbr \
+  --kf-max-dist=90000 \
+  --max-q=58 \
+  --min-q=2 \
+  --overshoot-pct=50 \
+  --undershoot-pct=50 \
+  --passes=1 \
+  --aq-mode=3 \
+  --buf-initial-sz=500 \
+  --buf-optimal-sz=600 \
+  --buf-sz=1000 \
+  --coeff-cost-upd-freq=3 \
+  --dv-cost-upd-freq=3 \
+  --mode-cost-upd-freq=3 \
+  --mv-cost-upd-freq=3 \
+  --deltaq-mode=0 \
+  --enable-global-motion=0 \
+  --enable-obmc=0 \
+  --enable-order-hint=0 \
+  --enable-ref-frame-mvs=0 \
+  --enable-tpl-model=0 \
+  --enable-warped-motion=0 \
+  --lag-in-frames=0 \
+  --max-intra-rate=300 \
+  --noise-sensitivity=0"
+}
+
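+# The parameter helpers above are expanded on the aomenc command line, for
+# example (illustrative only):
+#   aomenc_rt $(yuv_raw_input) $(av1_encode_rt_params) --cpu-used=7 \
+#     --target-bitrate=200 -o out.ivf
+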
+# Configures and builds aomenc for the given target in the
+# AOM_TEST_OUTPUT_DIR/build_target_${target} directory.
+av1_enc_build() {
+  local target="$1"
+  local cmake_command="$2"
+  local tmp_build_dir=${AOM_TEST_OUTPUT_DIR}/build_target_${target}
+  if [ -d "$tmp_build_dir" ]; then
+    rm -rf $tmp_build_dir
+  fi
+
+  mkdir -p $tmp_build_dir
+  cd $tmp_build_dir
+
+  local cmake_common_args="-DCONFIG_EXCLUDE_SIMD_MISMATCH=1 \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DENABLE_CCACHE=1 \
+           '-DCMAKE_C_FLAGS_RELEASE=-O3 -g' \
+           '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g'"
+
+  for preset in $PRESETS; do
+    echo "Building target[${preset} encoding]: ${target}"
+    if [ "${preset}" = "good" ]; then
+      local cmake_extra_args="-DCONFIG_AV1_HIGHBITDEPTH=1"
+    elif [ "${preset}" = "rt" ]; then
+      local cmake_extra_args="-DCONFIG_REALTIME_ONLY=1 -DCONFIG_AV1_HIGHBITDEPTH=0"
+    else
+      elog "Invalid preset"
+      return 1
+    fi
+    eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" ${devnull}
+    eval make -j$(nproc) ${devnull}
+    mv aomenc aomenc_${preset}
+  done
+  echo "Done building target: ${target}"
+}
+
+compare_enc_output() {
+  local target=$1
+  local cpu=$2
+  local clip=$3
+  local bitrate=$4
+  local preset=$5
+  if ! diff -q ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+       ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then
+    elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset"
+    return 1
+  fi
+}
+
+av1_enc_test() {
+  local encoder="$1"
+  local arch="$2"
+  local target="$3"
+  local preset="$4"
+  if [ -z "$(av1_enc_tool_path "${target}"  "${preset}")" ]; then
+    elog "aomenc_{preset} not found. It must exist in ${AOM_TEST_OUTPUT_DIR}/build_target_${target} path"
+    return 1
+  fi
+
+  if [ "${preset}" = "good" ]; then
+    if [ "${arch}" = "x86_64" ]; then
+      local min_cpu_used=0
+      local max_cpu_used=6
+    elif [ "${arch}" = "x86" ]; then
+      local min_cpu_used=2
+      local max_cpu_used=3
+    fi
+    local test_params=av1_encode_good_params
+  elif [ "${preset}" = "rt" ]; then
+    local min_cpu_used=5
+    local max_cpu_used=11
+    local test_params=av1_encode_rt_params
+  else
+    elog "Invalid preset"
+    return 1
+  fi
+
+  for cpu in $(seq $min_cpu_used $max_cpu_used); do
+    if [ "${preset}" = "good" ]; then
+      if [ "${arch}" = "x86_64" ]; then
+        if [ "${cpu}" -lt 2 ]; then
+          local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}"
+        elif [ "${cpu}" -lt 5 ]; then
+          local test_clips="${LOWBD_480p_CLIP} ${HIGHBD_CLIP}"
+        else
+          local test_clips="${LOWBD_720p_CLIP} ${HIGHBD_CLIP}"
+        fi
+      elif [ "${arch}" = "x86" ]; then
+        local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}"
+      elif [ "${arch}" = "arm64" ]; then
+        # TODO(BUG=aomedia:3474): Enable testing of high bit-depth clips after
+        # fixing C vs SIMD mismatches.
+        local test_clips="${LOWBD_CIF_CLIP}"
+      fi
+    elif [ "${preset}" = "rt" ]; then
+      if [ "${cpu}" -lt 8 ]; then
+        local test_clips="${LOWBD_CIF_CLIP} ${SC_CLIP}"
+      else
+        local test_clips="${LOWBD_480p_CLIP} ${SC_CLIP}"
+      fi
+    else
+      elog "Invalid preset"
+      return 1
+    fi
+
+    for clip in ${test_clips}; do
+      local test_bitrates=$(get_bitrates ${clip} ${preset})
+      for bitrate in ${test_bitrates}; do
+        eval "${encoder}" $($clip) $($test_params) \
+        "--limit=${AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \
+        "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \
+        ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+        ${devnull}
+
+        if [ "${target}" != "generic" ]; then
+          if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then
+            # Found a mismatch
+            return 1
+          fi
+        fi
+      done
+    done
+  done
+}
+
+av1_test_generic() {
+  local arch=$1
+  local target="generic"
+  if [ $arch = "x86_64" ]; then
+    local cmake_command="cmake $LIBAOM_SOURCE_DIR -DAOM_TARGET_CPU=${target}"
+  elif [ $arch = "x86" ]; then
+    # As AV1 encode output differs between x86 32-bit and 64-bit platforms
+    # (BUG=aomedia:3479), the x86 32-bit C-only build is generated separately.
+    # The cmake option -DENABLE_MMX=0 disables all SIMD optimizations and
+    # generates a C-only binary.
+    local cmake_command="cmake $LIBAOM_SOURCE_DIR -DENABLE_MMX=0 \
+      -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake"
+  fi
+
+  echo "Build for: Generic ${arch}"
+  av1_enc_build "${target}" "${cmake_command}"
+
+  for preset in $PRESETS; do
+    local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+    av1_enc_test $encoder "${arch}" "${target}" "${preset}"
+  done
+}
+
+# This function encodes the AV1 bitstream with SSE2, SSE3, SSSE3, SSE4_1, AVX and
+# AVX2 enabled, as there are no functions with MMX, SSE or AVX512 specializations.
+# The environment variable 'AOM_SIMD_CAPS_MASK' controls which instruction set
+# extension optimizations are enabled. The mask values and the corresponding
+# instruction set extensions enabled are as follows:
+# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+#   1     1    1    1      1    1    1    1   1  -> 0x1FF -> Enable AVX512 and lower variants
+#   0     1    1    1      1    1    1    1   1  -> 0x0FF -> Enable AVX2 and lower variants
+#   0     0    1    1      1    1    1    1   1  -> 0x07F -> Enable AVX and lower variants
+#   0     0    0    1      1    1    1    1   1  -> 0x03F -> Enable SSE4_1 and lower variants
+#   0     0    0    0      1    1    1    1   1  -> 0x01F -> Enable SSSE3 and lower variants
+#   0     0    0    0      0    1    1    1   1  -> 0x00F -> Enable SSE3 and lower variants
+#   0     0    0    0      0    0    1    1   1  -> 0x007 -> Enable SSE2 and lower variants
+#   0     0    0    0      0    0    0    1   1  -> 0x003 -> Enable SSE and lower variants
+#   0     0    0    0      0    0    0    0   1  -> 0x001 -> Enable MMX
+## NOTE: On x86_64 platforms, it is not possible to enable sse/mmx/c via
+#  "AOM_SIMD_CAPS_MASK" as all x86_64 platforms implement sse2.
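+# For example, the loop in av1_test_x86() below effectively performs the
+# following for the SSE4_1 pass (shown here for illustration only):
+#   export AOM_SIMD_CAPS_MASK=$(sse4_1)   # 0x03F
+#   av1_enc_test "${encoder}" "${arch}" "${target}" "${preset}"
+#   unset AOM_SIMD_CAPS_MASK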
+av1_test_x86() {
+  local arch=$1
+
+  if ! uname -m | grep -q "x86"; then
+    elog "Machine architecture is not x86 or x86_64"
+    return 0
+  fi
+
+  if [ $arch = "x86" ]; then
+    local target="x86-linux"
+    local cmake_command="cmake \
+    $LIBAOM_SOURCE_DIR \
+    -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake"
+  elif [ $arch = "x86_64" ]; then
+    local target="x86_64-linux"
+    local cmake_command="cmake $LIBAOM_SOURCE_DIR"
+  fi
+
+  # Available x86 isa variants: "avx2 avx sse4_1 ssse3 sse3 sse2"
+  local x86_isa_variants="avx2 sse4_1 sse2"
+
+  echo "Build for x86: ${target}"
+  av1_enc_build "${target}" "${cmake_command}"
+
+  for preset in $PRESETS; do
+    local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+    for isa in $x86_isa_variants; do
+      # has_x86_isa_extn returns a non-zero (failure) status when the ISA
+      # extension is not supported on this machine.
+      if ! has_x86_isa_extn $isa; then
+        echo "${isa} is not supported in this machine"
+        continue
+      fi
+      export AOM_SIMD_CAPS_MASK=$($isa)
+      if ! av1_enc_test $encoder "${arch}" "${target}" "${preset}"; then
+        # Found a mismatch
+        return 1
+      fi
+      unset AOM_SIMD_CAPS_MASK
+    done
+  done
+}
+
+av1_test_arm() {
+  local arch="arm64"
+  local target="arm64-linux-gcc"
+  local cmake_command="cmake $LIBAOM_SOURCE_DIR \
+        -DCMAKE_TOOLCHAIN_FILE=$LIBAOM_SOURCE_DIR/build/cmake/toolchains/${target}.cmake \
+        -DCMAKE_C_FLAGS=-Wno-maybe-uninitialized"
+  echo "Build for arm64: ${target}"
+  av1_enc_build "${target}" "${cmake_command}"
+
+  for preset in $PRESETS; do
+    # Enable the armv8 test for the real-time preset only.
+    # TODO(BUG=aomedia:3486, BUG=aomedia:3474): Enable testing for 'good' preset
+    # after fixing C vs NEON mismatches.
+    if [ "${preset}" = "good" ]; then
+      continue
+    fi
+    local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+    if ! av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${arch}" "${target}" "${preset}"; then
+      # Found a mismatch
+      return 1
+    fi
+  done
+}
+
+av1_c_vs_simd_enc_test () {
+  # Test x86 (32 bit)
+  # x86 requires the i686-linux-gnu toolchain:
+  # $ sudo apt-get install g++-i686-linux-gnu
+  echo "av1 test for x86 (32 bit): Started."
+  # Encode 'C' only
+  av1_test_generic "x86"
+  # Encode with SIMD optimizations enabled
+  if ! av1_test_x86 "x86"; then
+    echo "av1 test for x86 (32 bit): Done, test failed."
+    return 1
+  else
+    echo "av1 test for x86 (32 bit): Done, all tests passed."
+  fi
+
+  # Test x86_64 (64 bit)
+  if [ "$(eval uname -m)" = "x86_64" ]; then
+    echo "av1 test for x86_64 (64 bit): Started."
+    # Encode 'C' only
+    av1_test_generic "x86_64"
+    # Encode with SIMD optimizations enabled
+    if ! av1_test_x86 "x86_64"; then
+      echo "av1 test for x86_64 (64 bit): Done, test failed."
+      return 1
+    else
+      echo "av1 test for x86_64 (64 bit): Done, all tests passed."
+    fi
+  fi
+
+  # Test ARM
+  echo "av1_test_arm: Started."
+  if ! av1_test_arm; then
+    echo "av1 test for arm: Done, test failed."
+    return 1
+  else
+    echo "av1 test for arm: Done, all tests passed."
+  fi
+}
+
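+# run_tests comes from tools_common.sh (sourced above); it verifies the
+# environment via the first argument and then runs the named test function.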
+run_tests av1_c_vs_simd_enc_verify_environment av1_c_vs_simd_enc_test
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index c321de2..76cf77a 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -258,14 +258,13 @@
 class ConvolveScaleTestBase : public ::testing::Test {
  public:
   ConvolveScaleTestBase() : image_(nullptr) {}
-  virtual ~ConvolveScaleTestBase() { delete image_; }
-  virtual void TearDown() {}
+  ~ConvolveScaleTestBase() override { delete image_; }
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
   // be templated for low/high bit depths because they have different
   // numbers of parameters)
-  virtual void SetUp() = 0;
+  void SetUp() override = 0;
   virtual void RunOne(bool ref) = 0;
 
  protected:
@@ -407,9 +406,9 @@
     : public ConvolveScaleTestBase<uint8_t>,
       public ::testing::WithParamInterface<LowBDParams> {
  public:
-  virtual ~LowBDConvolveScaleTest() {}
+  ~LowBDConvolveScaleTest() override = default;
 
-  void SetUp() {
+  void SetUp() override {
     tst_fun_ = GET_PARAM(0);
 
     const BlockDimension &block = GET_PARAM(1);
@@ -421,7 +420,7 @@
     SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
   }
 
-  void RunOne(bool ref) {
+  void RunOne(bool ref) override {
     const uint8_t *src = image_->GetSrcData(ref, false);
     uint8_t *dst = image_->GetDstData(ref, false);
     convolve_params_.dst = image_->GetDst16Data(ref, false);
@@ -490,9 +489,9 @@
     : public ConvolveScaleTestBase<uint16_t>,
       public ::testing::WithParamInterface<HighBDParams> {
  public:
-  virtual ~HighBDConvolveScaleTest() {}
+  ~HighBDConvolveScaleTest() override = default;
 
-  void SetUp() {
+  void SetUp() override {
     tst_fun_ = GET_PARAM(0);
 
     const BlockDimension &block = GET_PARAM(1);
@@ -504,7 +503,7 @@
     SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
   }
 
-  void RunOne(bool ref) {
+  void RunOne(bool ref) override {
     const uint16_t *src = image_->GetSrcData(ref, false);
     uint16_t *dst = image_->GetDstData(ref, false);
     convolve_params_.dst = image_->GetDst16Data(ref, false);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 873960d..5bbac21 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -183,14 +183,12 @@
 template <typename T>
 class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
  public:
-  ~AV1ConvolveTest() override { TearDown(); }
+  ~AV1ConvolveTest() override = default;
 
   void SetUp() override {
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }
 
-  void TearDown() override {}
-
   // Randomizes the 8-bit input buffer and returns a pointer to it. Note that
   // the pointer is safe to use with an 8-tap filter. The stride can range
   // from width to (width + kPadding). Also note that the pointer is to the
@@ -427,6 +425,99 @@
                          BuildLowbdParams(av1_convolve_x_sr_neon));
 #endif
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_neon_i8mm));
+#endif
+
+////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+class AV1ConvolveXIntraBCTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8.
+    constexpr int kSubX = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_convolve_x_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+                                width, height, filter_params_x, kSubX,
+                                &conv_params1);
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    test_func(input, width + 2, test, kOutputStride, width, height,
+              filter_params_x, kSubX, &conv_params2);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride, width,
+                                  height, filter_params_x, 0, &conv_params1);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, 0, &conv_params2);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveXIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXIntraBCTest,
+                         BuildLowbdParams(av1_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXIntraBCTest,
+                         BuildLowbdParams(av1_convolve_x_sr_intrabc_neon));
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
 /////////////////////////////////////////////////////////
 // Single reference convolve-x functions (high bit-depth)
@@ -540,6 +631,94 @@
                          BuildHighbdParams(av1_highbd_convolve_x_sr_neon));
 #endif
 
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1ConvolveXHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8.
+    constexpr int kSubX = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_x_sr_intrabc_c(
+        input, width + 2, reference, kOutputStride, width, height,
+        filter_params_x, kSubX, &conv_params1, bit_depth);
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_x, kSubX, &conv_params2,
+                              bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride,
+                                         width, height, filter_params_x, 0,
+                                         &conv_params1, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    highbd_convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, 0, &conv_params2, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveXHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdIntraBCTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1ConvolveXHighbdIntraBCTest,
+    BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_neon));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 ////////////////////////////////////////////////////////
@@ -643,6 +822,80 @@
                          BuildLowbdParams(av1_convolve_y_sr_neon));
 #endif
 
+////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+class AV1ConvolveYIntraBCTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8.
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_convolve_y_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+                                width, height, filter_params_y, kSubY);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_y, kSubY);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride, width,
+                                  height, filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    convolve_y_func test_func = GetParam().TestFunction();
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveYIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYIntraBCTest,
+                         BuildLowbdParams(av1_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYIntraBCTest,
+                         BuildLowbdParams(av1_convolve_y_sr_intrabc_neon));
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
 /////////////////////////////////////////////////////////
 // Single reference convolve-y functions (high bit-depth)
@@ -745,6 +998,86 @@
                          BuildHighbdParams(av1_highbd_convolve_y_sr_neon));
 #endif
 
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1ConvolveYHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8.
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_y_sr_intrabc_c(input, width + 2, reference,
+                                       kOutputStride, width, height,
+                                       filter_params_y, kSubY, bit_depth);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_y, kSubY, bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride,
+                                         width, height, filter_params_y, 0,
+                                         bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    highbd_convolve_y_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, 0, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveYHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdIntraBCTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1ConvolveYHighbdIntraBCTest,
+    BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_neon));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 //////////////////////////////////////////////////////////////
@@ -830,6 +1163,11 @@
                          BuildHighbdParams(aom_highbd_convolve_copy_avx2));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_neon));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /////////////////////////////////////////////////////////
@@ -958,6 +1296,104 @@
                          BuildLowbdParams(av1_convolve_2d_sr_neon));
 #endif
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_neon_i8mm));
+#endif
+
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-2D IntraBC functions (low bit-depth)
+/////////////////////////////////////////////////////////////////
+
+class AV1Convolve2DIntraBCTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+    constexpr int kSubX = 8;
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_convolve_2d_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+                                 width, height, filter_params_x,
+                                 filter_params_y, kSubX, kSubY, &conv_params1);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_x, filter_params_y, kSubX,
+                              kSubY, &conv_params2);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);
+    const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_convolve_2d_sr_intrabc_c(input, width, reference, kOutputStride,
+                                   width, height, filter_params_x,
+                                   filter_params_y, 8, 8, &conv_params1);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    convolve_2d_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, filter_params_y, 8, 8, &conv_params2);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1Convolve2DIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DIntraBCTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DIntraBCTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_intrabc_neon));
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
 //////////////////////////////////////////////////////////
 // Single reference convolve-2d functions (high bit-depth)
@@ -1087,6 +1523,103 @@
                          BuildHighbdParams(av1_highbd_convolve_2d_sr_neon));
 #endif
 
+//////////////////////////////////////////////////////////////////
+// Single reference convolve-2d IntraBC functions (high bit-depth)
+//////////////////////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+    constexpr int kSubX = 8;
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_2d_sr_intrabc_c(input, width + 2, reference,
+                                        kOutputStride, width, height,
+                                        filter_params_x, filter_params_y, kSubX,
+                                        kSubY, &conv_params1, bit_depth);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_x, filter_params_y, kSubX,
+                              kSubY, &conv_params2, bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);
+    const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_2d_sr_intrabc_c(
+          input, width, reference, kOutputStride, width, height,
+          filter_params_x, filter_params_y, 0, 0, &conv_params1, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    highbd_convolve_2d_func test_func = GetParam().TestFunction();
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, filter_params_y, 0, 0, &conv_params2,
+                bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1Convolve2DHighbdIntraBCTest,
+    BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1Convolve2DHighbdIntraBCTest,
+    BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_neon));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 //////////////////////////
@@ -1304,6 +1837,18 @@
                          BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
 #endif
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AV1ConvolveXCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+    NEON_I8MM, AV1ConvolveXCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_i8mm));
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
 /////////////////////////////////////////////////
 // Compound convolve-x functions (high bit-depth)
@@ -1787,6 +2332,18 @@
                          BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
 #endif
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AV1Convolve2DCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+    NEON_I8MM, AV1Convolve2DCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_i8mm));
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
 //////////////////////////////////////////////////
 // Compound convolve-2d functions (high bit-depth)
diff --git a/test/av1_encoder_parms_get_to_decoder.cc b/test/av1_encoder_parms_get_to_decoder.cc
index e81ad87..402e70c 100644
--- a/test/av1_encoder_parms_get_to_decoder.cc
+++ b/test/av1_encoder_parms_get_to_decoder.cc
@@ -85,17 +85,17 @@
   AVxEncoderParmsGetToDecoder()
       : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
 
-  virtual ~AVxEncoderParmsGetToDecoder() {}
+  ~AVxEncoderParmsGetToDecoder() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 25;
     test_video_ = kAV1ParamPassingTestVector;
     cfg_.rc_target_bitrate = test_video_.bitrate;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 3);
       encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
@@ -113,8 +113,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     (void)pts;
     if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
       EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w);
@@ -127,14 +127,14 @@
     EXPECT_EQ(encode_parms.chroma_sample_position, img.csp);
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (encode_parms.lossless) {
       EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]);
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;
   }
diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc
index 5eaf382..59c44ca 100644
--- a/test/av1_ext_tile_test.cc
+++ b/test/av1_ext_tile_test.cc
@@ -58,12 +58,12 @@
     tile_md5_.clear();
   }
 
-  virtual ~AV1ExtTileTest() {
+  ~AV1ExtTileTest() override {
     aom_img_free(&tile_img_);
     delete decoder_;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
 
     cfg_.g_lag_in_frames = 0;
@@ -74,8 +74,8 @@
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       // Encode setting
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -96,8 +96,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     // Skip 1 already decoded frame to be consistent with the decoder in this
     // test.
     if (pts == (aom_codec_pts_t)kSkip) return;
@@ -108,7 +108,7 @@
     md5_.push_back(md5_res.Get());
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     // Skip decoding 1 frame.
     if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return;
 
diff --git a/test/av1_external_partition_test.cc b/test/av1_external_partition_test.cc
index 41fc96c..88f6216 100644
--- a/test/av1_external_partition_test.cc
+++ b/test/av1_external_partition_test.cc
@@ -247,9 +247,9 @@
   ExternalPartitionTestAPI()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
-  virtual ~ExternalPartitionTestAPI() {}
+  ~ExternalPartitionTestAPI() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -260,14 +260,14 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
@@ -287,8 +287,8 @@
     decision_mode_ = mode;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (decision_mode_ == AOM_EXT_PART_WHOLE_TREE) {
         aom_ext_part_funcs_t ext_part_funcs;
@@ -559,9 +559,9 @@
   ExternalPartitionTestDfsAPI()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
-  virtual ~ExternalPartitionTestDfsAPI() {}
+  ~ExternalPartitionTestDfsAPI() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -572,14 +572,14 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
@@ -597,8 +597,8 @@
     test_send_features_ = test_send_features;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       aom_ext_part_funcs_t ext_part_funcs;
       ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
diff --git a/test/av1_fwd_txfm1d_test.cc b/test/av1_fwd_txfm1d_test.cc
index 885a6db..6bae9f8 100644
--- a/test/av1_fwd_txfm1d_test.cc
+++ b/test/av1_fwd_txfm1d_test.cc
@@ -41,7 +41,7 @@
 };
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-const int8_t cos_bit = 14;
+const int8_t cos_bit = 13;
 const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
 
 TEST(av1_fwd_txfm1d, round_shift) {
@@ -56,7 +56,7 @@
 }
 
 TEST(av1_fwd_txfm1d, av1_cospi_arr_data) {
-  for (int i = 0; i < 7; i++) {
+  for (int i = 0; i < 4; i++) {
     for (int j = 0; j < 64; j++) {
       EXPECT_EQ(av1_cospi_arr_data[i][j],
                 (int32_t)round(cos(PI * j / 128) * (1 << (cos_bit_min + i))));
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index 7b84eb9..2ed5d94 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -38,7 +38,7 @@
 
 class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     tx_type_ = GET_PARAM(0);
     tx_size_ = GET_PARAM(1);
     max_error_ = GET_PARAM(2);
@@ -116,7 +116,7 @@
         << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(input_);
     aom_free(output_);
     aom_free(ref_input_);
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index dae53ea..2c57362 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -63,9 +63,9 @@
 
 class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
  public:
-  virtual ~AV1HighbdInvHTNxN() {}
+  ~AV1HighbdInvHTNxN() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     txfm_ref_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     inv_txfm_ref_ = GET_PARAM(2);
@@ -92,7 +92,7 @@
     ASSERT_NE(output_ref_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(input_);
     aom_free(coeffs_);
     aom_free(output_);
@@ -200,7 +200,7 @@
 class AV1HighbdInvTxfm2d
     : public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
  public:
-  virtual void SetUp() { target_func_ = GET_PARAM(0); }
+  void SetUp() override { target_func_ = GET_PARAM(0); }
   void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
                            int bit_depth, int gt_int16 = 0);
 
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
index 28ee534..e9cf02e 100644
--- a/test/av1_horz_only_frame_superres_test.cc
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -162,14 +162,13 @@
 class ConvolveHorizRSTestBase : public ::testing::Test {
  public:
   ConvolveHorizRSTestBase() : image_(nullptr) {}
-  virtual ~ConvolveHorizRSTestBase() {}
-  virtual void TearDown() {}
+  ~ConvolveHorizRSTestBase() override = default;
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
   // be templated for low/high bit depths because they have different
   // numbers of parameters)
-  virtual void SetUp() = 0;
+  void SetUp() override = 0;
   virtual void RunOne(bool ref) = 0;
 
  protected:
@@ -261,15 +260,15 @@
     : public ConvolveHorizRSTestBase<uint8_t>,
       public ::testing::WithParamInterface<LowBDParams> {
  public:
-  virtual ~LowBDConvolveHorizRSTest() {}
+  ~LowBDConvolveHorizRSTest() override = default;
 
-  void SetUp() {
+  void SetUp() override {
     tst_fun_ = GET_PARAM(0);
     const int bd = 8;
     SetBitDepth(bd);
   }
 
-  void RunOne(bool ref) {
+  void RunOne(bool ref) override {
     const uint8_t *src = image_->GetSrcData(ref, false);
     uint8_t *dst = image_->GetDstData(ref, false);
     const int src_stride = image_->src_stride();
@@ -322,15 +321,15 @@
     : public ConvolveHorizRSTestBase<uint16_t>,
       public ::testing::WithParamInterface<HighBDParams> {
  public:
-  virtual ~HighBDConvolveHorizRSTest() {}
+  ~HighBDConvolveHorizRSTest() override = default;
 
-  void SetUp() {
+  void SetUp() override {
     tst_fun_ = GET_PARAM(0);
     const int bd = GET_PARAM(1);
     SetBitDepth(bd);
   }
 
-  void RunOne(bool ref) {
+  void RunOne(bool ref) override {
     const uint16_t *src = image_->GetSrcData(ref, false);
     uint16_t *dst = image_->GetDstData(ref, false);
     const int src_stride = image_->src_stride();
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index dfa0481..35a87a4 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -49,7 +49,7 @@
 
 class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     tx_type_ = GET_PARAM(0);
     tx_size_ = GET_PARAM(1);
     max_error_ = GET_PARAM(2);
@@ -249,7 +249,7 @@
 typedef std::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
 class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
  public:
-  virtual void SetUp() { target_func_ = GET_PARAM(0); }
+  void SetUp() override { target_func_ = GET_PARAM(0); }
   void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
                            int gt_int16 = 0);
 
@@ -393,8 +393,6 @@
                          ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
 #endif  // HAVE_AVX2
 
-// TODO(yunqing): Re-enable this unit test for NEON version after the functions
-// are fixed.
 #if HAVE_NEON
 extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
                                               uint8_t *output, int stride,
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index 99f0fba..7e66a8e 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -46,10 +46,8 @@
 class AV1KmeansTest1
     : public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
  public:
-  ~AV1KmeansTest1();
-  void SetUp();
-
-  void TearDown();
+  ~AV1KmeansTest1() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
@@ -75,7 +73,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
 
-AV1KmeansTest1::~AV1KmeansTest1() {}
+AV1KmeansTest1::~AV1KmeansTest1() = default;
 
 void AV1KmeansTest1::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -87,8 +85,6 @@
   }
 }
 
-void AV1KmeansTest1::TearDown() {}
-
 void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
                                     BLOCK_SIZE bsize, int k) {
   const int w = block_size_wide[bsize];
@@ -152,10 +148,8 @@
 class AV1KmeansTest2
     : public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
  public:
-  ~AV1KmeansTest2();
-  void SetUp();
-
-  void TearDown();
+  ~AV1KmeansTest2() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
@@ -185,7 +179,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
 
-AV1KmeansTest2::~AV1KmeansTest2() {}
+AV1KmeansTest2::~AV1KmeansTest2() = default;
 
 void AV1KmeansTest2::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -197,8 +191,6 @@
   }
 }
 
-void AV1KmeansTest2::TearDown() {}
-
 void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
                                     BLOCK_SIZE bsize, int k) {
   const int w = block_size_wide[bsize];
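For context, the dim1/dim2 kernels exercised here assign each input sample the index of its nearest centroid, which is the assignment step of the palette k-means search. The sketch below is an illustrative 1-D reference; the name and signature are hypothetical and may not match the library's prototype.

#include <cstdint>
#include <cstdlib>

// Illustrative reference for the dim1 "calc indices" step: each sample gets
// the index of the closest of the k centroids. Signature is hypothetical.
static void calc_indices_dim1_ref(const int16_t *data,
                                  const int16_t *centroids, uint8_t *indices,
                                  int n, int k) {
  for (int i = 0; i < n; ++i) {
    int best = 0;
    int best_dist = abs(data[i] - centroids[0]);
    for (int j = 1; j < k; ++j) {
      const int dist = abs(data[i] - centroids[j]);
      if (dist < best_dist) {
        best_dist = dist;
        best = j;
      }
    }
    indices[i] = static_cast<uint8_t>(best);
  }
}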
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 48504c8..4201ea6 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -34,7 +34,7 @@
 
 class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
     // Allocate two massive buffers on the heap for edge weights and node bias
     // Then set-up the double-dimension arrays pointing into the big buffers
@@ -51,7 +51,7 @@
     }
     target_func_ = GET_PARAM(0);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(weights_buf);
     aom_free(bias_buf);
   }
@@ -65,8 +65,8 @@
  private:
   NnPredict_Func target_func_;
   libaom_test::ACMRandom rng_;
-  float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
-  float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *weights[NN_MAX_HIDDEN_LAYERS + 1] = {};
+  float *bias[NN_MAX_HIDDEN_LAYERS + 1] = {};
   float *weights_buf = nullptr, *bias_buf = nullptr;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
@@ -176,13 +176,15 @@
 // runs of the encoder.  It also conveniently covers all the kernels
 // implemented.
 static const NN_CONFIG kShapes[] = {
-  { 10, 16, 1, { 64 }, { 0 }, { 0 } }, { 12, 1, 1, { 12 }, { 0 }, { 0 } },
-  { 12, 1, 1, { 24 }, { 0 }, { 0 } },  { 12, 1, 1, { 32 }, { 0 }, { 0 } },
-  { 18, 4, 1, { 24 }, { 0 }, { 0 } },  { 18, 4, 1, { 32 }, { 0 }, { 0 } },
-  { 4, 1, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 16 }, { 0 }, { 0 } },
-  { 8, 4, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 24 }, { 0 }, { 0 } },
-  { 8, 1, 1, { 32 }, { 0 }, { 0 } },   { 8, 1, 1, { 64 }, { 0 }, { 0 } },
-  { 9, 3, 1, { 32 }, { 0 }, { 0 } },   { 4, 4, 1, { 8 }, { 0 }, { 0 } },
+  { 37, 1, 2, { 16, 24 }, {}, {} }, { 24, 24, 1, { 12 }, {}, {} },
+  { 10, 16, 1, { 64 }, {}, {} },    { 12, 1, 1, { 12 }, {}, {} },
+  { 12, 1, 1, { 24 }, {}, {} },     { 12, 1, 1, { 32 }, {}, {} },
+  { 18, 4, 1, { 24 }, {}, {} },     { 18, 4, 1, { 32 }, {}, {} },
+  { 4, 1, 1, { 16 }, {}, {} },      { 8, 1, 0, { 0 }, {}, {} },
+  { 8, 4, 1, { 16 }, {}, {} },      { 8, 1, 1, { 32 }, {}, {} },
+  { 9, 3, 1, { 32 }, {}, {} },      { 8, 4, 0, { 0 }, {}, {} },
+  { 8, 8, 0, { 0 }, {}, {} },       { 4, 4, 1, { 8 }, {}, {} },
+  { 4, 3, 0, { 64 }, {}, {} },
 };
 
 void NnPredictTest::RunNnPredictTest_all(const NN_CONFIG *const shapes,
@@ -206,14 +208,21 @@
                             10000000);
 }
 
-#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+#if !CONFIG_EXCLUDE_SIMD_MISMATCH
+#if HAVE_SSE3
 INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
                          ::testing::Values(av1_nn_predict_sse3));
 #endif
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_avx2));
+#endif
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, NnPredictTest,
                          ::testing::Values(av1_nn_predict_neon));
 #endif
+#endif  // !CONFIG_EXCLUDE_SIMD_MISMATCH
 
 }  // namespace
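The kShapes entries above follow the NN_CONFIG layout of inputs, outputs, hidden-layer count, and per-layer widths, and the kernel under test is a small fully connected forward pass. Below is a hedged sketch of such a pass; the ReLU on hidden layers and the linear output layer are assumptions, not facts taken from the patch.

#include <algorithm>
#include <vector>

// Minimal dense forward pass in the spirit of the NN_CONFIG shapes above:
// layer widths run input -> hidden... -> output, weights are row-major per
// layer. ReLU on hidden layers is assumed here, not taken from the patch.
static std::vector<float> nn_forward_sketch(
    const std::vector<float> &input, const std::vector<int> &layer_widths,
    const std::vector<std::vector<float>> &weights,
    const std::vector<std::vector<float>> &biases) {
  std::vector<float> act = input;
  const int num_layers = static_cast<int>(layer_widths.size()) - 1;
  for (int l = 0; l < num_layers; ++l) {
    const int in_w = layer_widths[l], out_w = layer_widths[l + 1];
    std::vector<float> next(out_w);
    for (int o = 0; o < out_w; ++o) {
      float sum = biases[l][o];
      for (int i = 0; i < in_w; ++i) sum += weights[l][o * in_w + i] * act[i];
      // Hidden layers get ReLU; the final layer is left linear (assumption).
      next[o] = (l + 1 < num_layers) ? std::max(sum, 0.0f) : sum;
    }
    act.swap(next);
  }
  return act;
}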
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 5823647..c8af14a 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -183,11 +183,9 @@
     }
   }
 
-  virtual void SetUp() { params_ = GetParam(); }
+  void SetUp() override { params_ = GetParam(); }
 
-  virtual void TearDown() {}
-
-  virtual ~AV1QuantizeTest() {}
+  ~AV1QuantizeTest() override = default;
 
  private:
   TX_SIZE getTxSize(int count) {
diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc
index facb84b..937e864 100644
--- a/test/av1_round_shift_array_test.cc
+++ b/test/av1_round_shift_array_test.cc
@@ -39,10 +39,11 @@
 class AV1CompRoundShiftTest
     : public ::testing::TestWithParam<CompRoundShiftParam> {
  public:
-  ~AV1CompRoundShiftTest();
+  ~AV1CompRoundShiftTest() override;
 
-  void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
-  void TearDown() {}
+  void SetUp() override {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
 
  protected:
   void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
@@ -54,7 +55,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompRoundShiftTest);
 
-AV1CompRoundShiftTest::~AV1CompRoundShiftTest() {}
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() = default;
 
 void AV1CompRoundShiftTest::RunCheckOutput(
     comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
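The kernel exercised by this test applies a rounding right-shift to every element of an array, i.e. round_shift(x, bit) = (x + 2^(bit - 1)) >> bit for positive bit. A generic per-element sketch with an illustrative signature:

#include <cstdint>

// Generic rounding right-shift applied element-wise. The signature is
// illustrative, not the library's prototype, and the left-shift case for
// non-positive bit is omitted from this sketch.
static void round_shift_array_ref(int32_t *arr, int size, int bit) {
  if (bit <= 0) return;
  const int32_t rounding = 1 << (bit - 1);
  for (int i = 0; i < size; ++i) arr[i] = (arr[i] + rounding) >> bit;
}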
diff --git a/test/av1_softmax_test.cc b/test/av1_softmax_test.cc
index 60c7b6f..2b04af1 100644
--- a/test/av1_softmax_test.cc
+++ b/test/av1_softmax_test.cc
@@ -35,7 +35,7 @@
 class FastSoftmaxTest : public ::testing::TestWithParam<FastSoftmaxTestParams> {
  public:
   FastSoftmaxTest() : target_fn_(GET_PARAM(0)), num_classes_(GET_PARAM(1)) {}
-  virtual void SetUp() {
+  void SetUp() override {
     ref_buf_.reset(new (std::nothrow) float[num_classes_]());
     ASSERT_NE(ref_buf_, nullptr);
     dst_buf_.reset(new (std::nothrow) float[num_classes_]());
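The fast kernel tested here approximates a softmax over num_classes_ scores. As a generic reference (not libaom's implementation), a numerically stable softmax subtracts the maximum score before exponentiating:

#include <cmath>

// Generic stable softmax: out[i] = exp(in[i] - max) / sum_j exp(in[j] - max).
static void softmax_ref(const float *in, float *out, int n) {
  float max_val = in[0];
  for (int i = 1; i < n; ++i) max_val = in[i] > max_val ? in[i] : max_val;
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    out[i] = std::exp(in[i] - max_val);
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;
}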
diff --git a/test/av1_temporal_denoiser_test.cc b/test/av1_temporal_denoiser_test.cc
index 571fd92..7aa8fb6 100644
--- a/test/av1_temporal_denoiser_test.cc
+++ b/test/av1_temporal_denoiser_test.cc
@@ -43,11 +43,9 @@
     : public ::testing::Test,
       public ::testing::WithParamInterface<AV1DenoiserTestParam> {
  public:
-  virtual ~AV1DenoiserTest() {}
+  ~AV1DenoiserTest() override = default;
 
-  virtual void SetUp() { bs_ = GET_PARAM(1); }
-
-  virtual void TearDown() {}
+  void SetUp() override { bs_ = GET_PARAM(1); }
 
  protected:
   BLOCK_SIZE bs_;
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index 46f6d92..1055ff3 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -379,6 +379,16 @@
     NEON, WedgeUtilsSSEOptTest,
     ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
                                     av1_wedge_sse_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, WedgeUtilsSignOptTest,
+    ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+                                     av1_wedge_sign_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, WedgeUtilsDeltaSquaresOptTest,
+    ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+                                   av1_wedge_compute_delta_squares_neon)));
 #endif  // HAVE_NEON
 
 #if HAVE_AVX2
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 8865915..d7817a8 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -35,7 +35,7 @@
       : width_(width), height_(height), source_data_(nullptr),
         source_stride_(0), bit_depth_(bit_depth) {}
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(source_data_);
     source_data_ = nullptr;
   }
@@ -47,7 +47,7 @@
   static const int kDataBlockHeight = 128;
   static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
 
-  virtual void SetUp() {
+  void SetUp() override {
     const testing::TestInfo *const test_info =
         testing::UnitTest::GetInstance()->current_test_info();
     // Skip the speed test for C code as the baseline uses the same function.
@@ -378,7 +378,7 @@
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = static_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -391,7 +391,7 @@
     ASSERT_NE(hbuf_c_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(source_data_);
     source_data_ = nullptr;
     aom_free(hbuf_c_);
@@ -469,7 +469,7 @@
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = static_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -482,7 +482,7 @@
     ASSERT_NE(vbuf_c_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(source_data_);
     source_data_ = nullptr;
     aom_free(vbuf_c_);
@@ -582,13 +582,13 @@
 class VectorVarTestBase : public ::testing::Test {
  public:
   explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
-  VectorVarTestBase() {}
-  ~VectorVarTestBase() {}
+  VectorVarTestBase() = default;
+  ~VectorVarTestBase() override = default;
 
  protected:
   static const int kDataAlignment = 16;
 
-  virtual void SetUp() {
+  void SetUp() override {
     width = 4 << m_bwl;
 
     ref_vector = static_cast<int16_t *>(
@@ -600,7 +600,7 @@
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(ref_vector);
     ref_vector = nullptr;
     aom_free(src_vector);
@@ -883,13 +883,13 @@
     satd_func_ref_ = func_param.func_ref;
     satd_func_simd_ = func_param.func_simd;
   }
-  virtual void SetUp() {
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<CoeffType *>(
         aom_memalign(32, sizeof(*src_) * satd_size_));
     ASSERT_NE(src_, nullptr);
   }
-  virtual void TearDown() { aom_free(src_); }
+  void TearDown() override { aom_free(src_); }
   void FillConstant(const CoeffType val) {
     for (int i = 0; i < satd_size_; ++i) src_[i] = val;
   }
@@ -963,13 +963,13 @@
 };
 
 TEST_P(SatdTest, MinValue) {
-  const int kMin = -32640;
+  const int kMin = -524287;
   const int expected = -kMin * satd_size_;
   FillConstant(kMin);
   Check(expected);
 }
 TEST_P(SatdTest, MaxValue) {
-  const int kMax = 32640;
+  const int kMax = 524287;
   const int expected = kMax * satd_size_;
   FillConstant(kMax);
   Check(expected);
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index 9a95987..f9549bc 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -41,13 +41,13 @@
   static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
   static const int kMaxMaskSize = kMaxMaskWidth;
 
-  virtual ~BlendA64Mask1DTest() {}
+  ~BlendA64Mask1DTest() override = default;
 
   virtual void Execute(const T *p_src0, const T *p_src1) = 0;
 
-  void Common() {
-    w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
-    h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
+  void Common(int block_size) {
+    w_ = block_size_wide[block_size];
+    h_ = block_size_high[block_size];
 
     dst_offset_ = this->rng_(33);
     dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -121,7 +121,7 @@
 
 class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
  protected:
-  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) override {
     params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
                      src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
                      w_, h_);
@@ -132,7 +132,7 @@
 };
 
 TEST_P(BlendA64Mask1DTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_.Rand8();
       dst_tst_[i] = rng_.Rand8();
@@ -144,23 +144,23 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    Common();
+    Common(bsize);
   }
 }
 
 TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(2) + 254;
-      dst_tst_[i] = rng_(2) + 254;
-      src0_[i] = rng_(2) + 254;
-      src1_[i] = rng_(2) + 254;
-    }
+  for (int i = 0; i < kBufSize; ++i) {
+    dst_ref_[i] = rng_(2) + 254;
+    dst_tst_[i] = rng_(2) + 254;
+    src0_[i] = rng_(2) + 254;
+    src1_[i] = rng_(2) + 254;
+  }
 
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+  for (int i = 0; i < kMaxMaskSize; ++i)
+    mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    Common();
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    Common(bsize);
   }
 }
 
@@ -227,7 +227,7 @@
 
 class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
  protected:
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) override {
     params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
                      CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
                      CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
@@ -243,37 +243,27 @@
 };
 
 TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    switch (rng_(3)) {
-      case 0: bit_depth_ = 8; break;
-      case 1: bit_depth_ = 10; break;
-      default: bit_depth_ = 12; break;
+  for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+      const int hi = 1 << bit_depth_;
+
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_(hi);
+        dst_tst_[i] = rng_(hi);
+        src0_[i] = rng_(hi);
+        src1_[i] = rng_(hi);
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      Common(bsize);
     }
-
-    const int hi = 1 << bit_depth_;
-
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(hi);
-      dst_tst_[i] = rng_(hi);
-      src0_[i] = rng_(hi);
-      src1_[i] = rng_(hi);
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    Common();
   }
 }
 
 TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
-  for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
-    switch (rng_(3)) {
-      case 0: bit_depth_ = 8; break;
-      case 1: bit_depth_ = 10; break;
-      default: bit_depth_ = 12; break;
-    }
-
+  for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
     const int hi = 1 << bit_depth_;
     const int lo = hi - 2;
 
@@ -287,7 +277,9 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    Common();
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+      Common(bsize);
+    }
   }
 }
 
@@ -336,5 +328,15 @@
                       TestFuncsHBD(highbd_blend_a64_vmask_ref,
                                    aom_highbd_blend_a64_vmask_sse4_1)));
 #endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlendA64Mask1DTestHBD,
+    ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+                                   aom_highbd_blend_a64_hmask_neon),
+                      TestFuncsHBD(highbd_blend_a64_vmask_ref,
+                                   aom_highbd_blend_a64_vmask_neon)));
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
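The masks generated above span [0, AOM_BLEND_A64_MAX_ALPHA], and the functions under test blend two sources with weights m/64 and (64 - m)/64 plus rounding. The per-pixel sketch below is a reader's reference for that arithmetic, not the authoritative library macro.

#include <cstdint>

// Per-pixel 6-bit alpha blend in the spirit of the A64 tests above:
// dst = (m * s0 + (64 - m) * s1 + 32) >> 6, with m in [0, 64].
static inline uint8_t blend_a64_pixel(uint8_t s0, uint8_t s1, int m) {
  const int max_alpha = 64;  // AOM_BLEND_A64_MAX_ALPHA
  const int v = m * s0 + (max_alpha - m) * s1;
  return static_cast<uint8_t>((v + (max_alpha >> 1)) >> 6);
}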
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 9dece57..fafc7f0 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -41,7 +41,7 @@
   static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
   static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
 
-  virtual ~BlendA64MaskTest() {}
+  ~BlendA64MaskTest() override = default;
 
   virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
                        int run_times) = 0;
@@ -123,9 +123,11 @@
   }
 
   void RunTest(int block_size, int run_times) {
-    subx_ = Rand1();
-    suby_ = Rand1();
-    RunOneTest(block_size, subx_, suby_, run_times);
+    for (subx_ = 0; subx_ <= 1; subx_++) {
+      for (suby_ = 0; suby_ <= 1; suby_++) {
+        RunOneTest(block_size, subx_, suby_, run_times);
+      }
+    }
   }
 
   DstPixel dst_ref_[kBufSize];
@@ -163,7 +165,8 @@
 
 class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
  protected:
-  void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1,
+               int run_times) override {
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < run_times; ++i) {
@@ -193,8 +196,7 @@
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
 
 TEST_P(BlendA64MaskTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_.Rand8();
       dst_tst_[i] = rng_.Rand8();
@@ -211,21 +213,20 @@
 }
 
 TEST_P(BlendA64MaskTest8B, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(2) + 254;
-      dst_tst_[i] = rng_(2) + 254;
-      src0_[i] = rng_(2) + 254;
-      src1_[i] = rng_(2) + 254;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
-
-    RunTest(bsize, 1);
+  for (int i = 0; i < kBufSize; ++i) {
+    dst_ref_[i] = rng_(2) + 254;
+    dst_tst_[i] = rng_(2) + 254;
+    src0_[i] = rng_(2) + 254;
+    src1_[i] = rng_(2) + 254;
   }
+
+  for (int i = 0; i < kMaxMaskSize; ++i)
+    mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize)
+    RunTest(bsize, 1);
 }
+
 TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
   const int kRunTimes = 10000000;
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -240,10 +241,7 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunOneTest(bsize, 1, 1, kRunTimes);
-    RunOneTest(bsize, 1, 0, kRunTimes);
-    RunOneTest(bsize, 0, 1, kRunTimes);
-    RunOneTest(bsize, 0, 0, kRunTimes);
+    RunTest(bsize, kRunTimes);
   }
 }
 #if HAVE_SSE4_1
@@ -258,6 +256,12 @@
                                                      aom_blend_a64_mask_avx2)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendA64MaskTest8B,
+                         ::testing::Values(TestFuncs(aom_blend_a64_mask_c,
+                                                     aom_blend_a64_mask_neon)));
+#endif  // HAVE_NEON
+
 //////////////////////////////////////////////////////////////////////////////
 // 8 bit _d16 version
 //////////////////////////////////////////////////////////////////////////////
@@ -275,7 +279,8 @@
   // max number of bits used by the source
   static const int kSrcMaxBitsMask = 0x3fff;
 
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
     ConvolveParams conv_params;
     conv_params.round_0 = ROUND0_BITS;
     conv_params.round_1 = COMPOUND_ROUND1_BITS;
@@ -308,8 +313,7 @@
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
 
 TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_.Rand8();
       dst_tst_[i] = rng_.Rand8();
@@ -326,20 +330,35 @@
 }
 
 TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = 255;
-      dst_tst_[i] = 255;
+  for (int i = 0; i < kBufSize; ++i) {
+    dst_ref_[i] = 255;
+    dst_tst_[i] = 255;
 
-      src0_[i] = kSrcMaxBitsMask;
-      src1_[i] = kSrcMaxBitsMask;
+    src0_[i] = kSrcMaxBitsMask;
+    src1_[i] = kSrcMaxBitsMask;
+  }
+
+  for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize)
+    RunTest(bsize, 1);
+}
+
+TEST_P(BlendA64MaskTest8B_d16, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+      src1_[i] = rng_.Rand16() & kSrcMaxBitsMask;
     }
 
     for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunTest(bsize, 1);
+    RunTest(bsize, kRunTimes);
   }
 }
 
@@ -377,7 +396,8 @@
 
 class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
  protected:
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < run_times; ++i) {
@@ -409,53 +429,46 @@
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTestHBD);
 
 TEST_P(BlendA64MaskTestHBD, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    switch (rng_(3)) {
-      case 0: bit_depth_ = 8; break;
-      case 1: bit_depth_ = 10; break;
-      default: bit_depth_ = 12; break;
-    }
-
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
     const int hi = 1 << bit_depth_;
 
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(hi);
-      dst_tst_[i] = rng_(hi);
-      src0_[i] = rng_(hi);
-      src1_[i] = rng_(hi);
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_(hi);
+        dst_tst_[i] = rng_(hi);
+        src0_[i] = rng_(hi);
+        src1_[i] = rng_(hi);
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      RunTest(bsize, 1);
     }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    RunTest(bsize, 1);
   }
 }
 
 TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
-  for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    switch (rng_(3)) {
-      case 0: bit_depth_ = 8; break;
-      case 1: bit_depth_ = 10; break;
-      default: bit_depth_ = 12; break;
-    }
-
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
     const int hi = 1 << bit_depth_;
     const int lo = hi - 2;
 
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(hi - lo) + lo;
-      dst_tst_[i] = rng_(hi - lo) + lo;
-      src0_[i] = rng_(hi - lo) + lo;
-      src1_[i] = rng_(hi - lo) + lo;
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+         ++bsize) {
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_(hi - lo) + lo;
+        dst_tst_[i] = rng_(hi - lo) + lo;
+        src0_[i] = rng_(hi - lo) + lo;
+        src1_[i] = rng_(hi - lo) + lo;
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+      RunTest(bsize, 1);
     }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
-
-    RunTest(bsize, 1);
   }
 }
 
@@ -466,6 +479,13 @@
                                    aom_highbd_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlendA64MaskTestHBD,
+    ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+                                   aom_highbd_blend_a64_mask_neon)));
+#endif  // HAVE_NEON
+
 //////////////////////////////////////////////////////////////////////////////
 // HBD _d16 version
 //////////////////////////////////////////////////////////////////////////////
@@ -485,7 +505,8 @@
   static const int kSrcMaxBitsMask = (1 << 14) - 1;
   static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
 
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
     ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
     ConvolveParams conv_params;
     conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
@@ -526,51 +547,49 @@
 
 TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
   if (params_.tst_func == nullptr) return;
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    switch (rng_(3)) {
-      case 0: bit_depth_ = 8; break;
-      case 1: bit_depth_ = 10; break;
-      default: bit_depth_ = 12; break;
-    }
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
     src_max_bits_mask_ =
         (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
 
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_.Rand8();
-      dst_tst_[i] = rng_.Rand8();
-
-      src0_[i] = rng_.Rand16() & src_max_bits_mask_;
-      src1_[i] = rng_.Rand16() & src_max_bits_mask_;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    RunTest(bsize, 1);
-  }
-}
-// TODO (Scott LaVarnway), fix this test
-TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) {
-  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
-    for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
-      src_max_bits_mask_ =
-          (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
-
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+         ++bsize) {
       for (int i = 0; i < kBufSize; ++i) {
-        dst_ref_[i] = 0;
-        dst_tst_[i] = (1 << bit_depth_) - 1;
+        dst_ref_[i] = rng_.Rand8();
+        dst_tst_[i] = rng_.Rand8();
 
-        src0_[i] = src_max_bits_mask_;
-        src1_[i] = src_max_bits_mask_;
+        src0_[i] = rng_.Rand16() & src_max_bits_mask_;
+        src1_[i] = rng_.Rand16() & src_max_bits_mask_;
       }
 
-      for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
       RunTest(bsize, 1);
     }
   }
 }
+
+TEST_P(BlendA64MaskTestHBD_d16, ExtremeValues) {
+  for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+    src_max_bits_mask_ =
+        (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = 0;
+      dst_tst_[i] = (1 << bit_depth_) - 1;
+
+      src0_[i] = src_max_bits_mask_;
+      src1_[i] = src_max_bits_mask_;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+      RunTest(bsize, 1);
+    }
+  }
+}
+
 TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
   const int kRunTimes = 10000000;
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -586,15 +605,15 @@
       for (int i = 0; i < kMaxMaskSize; ++i)
         mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-      RunOneTest(bsize, 1, 1, kRunTimes);
-      RunOneTest(bsize, 0, 0, kRunTimes);
+      RunTest(bsize, kRunTimes);
     }
   }
 }
 
-INSTANTIATE_TEST_SUITE_P(C, BlendA64MaskTestHBD_d16,
-                         ::testing::Values(TestFuncsHBD_d16(
-                             aom_highbd_blend_a64_d16_mask_c, nullptr)));
+INSTANTIATE_TEST_SUITE_P(
+    C, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_c)));
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
@@ -610,6 +629,13 @@
                                        aom_highbd_blend_a64_d16_mask_avx2)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_neon)));
+#endif  // HAVE_NEON
+
 // TODO(slavarnway): Enable the following in the avx2 commit. (56501)
 #if 0
 #if HAVE_AVX2
diff --git a/test/block_test.cc b/test/block_test.cc
index 74deee3..686180c 100644
--- a/test/block_test.cc
+++ b/test/block_test.cc
@@ -140,9 +140,9 @@
         superblock_size_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
     sb_size_violated_ = false;
   }
-  virtual ~SuperBlockSizeTestLarge() {}
+  ~SuperBlockSizeTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -152,10 +152,10 @@
     cfg_.rc_target_bitrate = 1000;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -163,8 +163,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec &&
         superblock_size_ != AOM_SUPERBLOCK_SIZE_DYNAMIC) {
diff --git a/test/borders_test.cc b/test/borders_test.cc
index bf9cc8b..594c3e8 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -24,12 +24,12 @@
       public ::libaom_test::EncoderTest {
  protected:
   BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTestLarge() {}
+  ~BordersTestLarge() override = default;
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 1);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -38,7 +38,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
     }
   }
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index 3f971be..ad54407 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -38,8 +38,8 @@
 
 class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
  public:
-  virtual ~CDEFBlockTest() {}
-  virtual void SetUp() {
+  ~CDEFBlockTest() override = default;
+  void SetUp() override {
     cdef = GET_PARAM(0);
     ref_cdef = GET_PARAM(1);
     bsize = GET_PARAM(2);
@@ -47,10 +47,8 @@
     depth = GET_PARAM(4);
   }
 
-  virtual void TearDown() {}
-
  protected:
-  int bsize;
+  BLOCK_SIZE bsize;
   int boundary;
   int depth;
   CdefFilterBlockFunctions cdef;
@@ -67,7 +65,8 @@
 typedef CDEFBlockTest CDEFSpeedHighbdTest;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedHighbdTest);
 
-int64_t test_cdef(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+int64_t test_cdef(BLOCK_SIZE bsize, int iterations,
+                  CdefFilterBlockFunctions cdef,
                   CdefFilterBlockFunctions ref_cdef, int boundary, int depth) {
   aom_usec_timer ref_timer;
   int64_t ref_elapsed_time = 0;
@@ -188,7 +187,8 @@
   return ref_elapsed_time;
 }
 
-void test_cdef_speed(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+void test_cdef_speed(BLOCK_SIZE bsize, int iterations,
+                     CdefFilterBlockFunctions cdef,
                      CdefFilterBlockFunctions ref_cdef, int boundary,
                      int depth) {
   int64_t ref_elapsed_time =
@@ -213,14 +213,12 @@
 
 class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
  public:
-  virtual ~CDEFFindDirTest() {}
-  virtual void SetUp() {
+  ~CDEFFindDirTest() override = default;
+  void SetUp() override {
     finddir = GET_PARAM(0);
     ref_finddir = GET_PARAM(1);
   }
 
-  virtual void TearDown() {}
-
  protected:
   find_dir_t finddir;
   find_dir_t ref_finddir;
@@ -304,14 +302,12 @@
 class CDEFFindDirDualTest
     : public ::testing::TestWithParam<find_dir_dual_param_t> {
  public:
-  virtual ~CDEFFindDirDualTest() {}
-  virtual void SetUp() {
+  ~CDEFFindDirDualTest() override = default;
+  void SetUp() override {
     finddir = GET_PARAM(0);
     ref_finddir = GET_PARAM(1);
   }
 
-  virtual void TearDown() {}
-
  protected:
   find_dir_dual_t finddir;
   find_dir_dual_t ref_finddir;
@@ -405,6 +401,177 @@
       ref_elapsed_time, elapsed_time, ref_elapsed_time / elapsed_time);
 }
 
+#define MAX_CDEF_BLOCK 256
+
+constexpr int kIterations = 100;
+
+using CDEFCopyRect8To16 = void (*)(uint16_t *dst, int dstride,
+                                   const uint8_t *src, int sstride, int width,
+                                   int height);
+
+using CDEFCopyRect8To16Param = std::tuple<CDEFCopyRect8To16, CDEFCopyRect8To16>;
+
+class CDEFCopyRect8to16Test
+    : public ::testing::TestWithParam<CDEFCopyRect8To16Param> {
+ public:
+  CDEFCopyRect8to16Test()
+      : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+        test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+  ~CDEFCopyRect8to16Test() override = default;
+  void SetUp() override {
+    src_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(src_, nullptr);
+    ref_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(ref_dst_, nullptr);
+    test_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(test_dst_, nullptr);
+  }
+
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(ref_dst_);
+    aom_free(test_dst_);
+  }
+
+  void test_copy_rect_8_to_16(CDEFCopyRect8To16 test_func,
+                              CDEFCopyRect8To16 ref_func) {
+    constexpr int stride = MAX_CDEF_BLOCK;
+    int error = 0;
+    for (int k = 0; k < kIterations && !error; k++) {
+      // This function operates on values of width that are either 4 or a
+      // multiple of 8. For height, generate a random even value between 2
+      // and 256.
+      const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8;
+      const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2;
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j++) {
+          src_[i * stride + j] = rnd_.Rand8();
+        }
+      }
+
+      ref_func(ref_dst_, stride, src_, stride, width, height);
+      test_func(test_dst_, stride, src_, stride, width, height);
+
+      int i, j;
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+          if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+            error = 1;
+            break;
+          }
+        }
+        if (error) {
+          break;
+        }
+      }
+      EXPECT_EQ(0, error)
+          << "Error: CDEFCopyRect8to16Test, SIMD and C mismatch." << std::endl
+          << "First error at " << i << "," << j << " ("
+          << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+          << ") " << std::endl
+          << "width: " << width << std::endl
+          << "height: " << height << std::endl
+          << std::endl;
+    }
+  }
+
+ protected:
+  libaom_test::ACMRandom rnd_;
+  uint8_t *src_;
+  uint16_t *ref_dst_;
+  uint16_t *test_dst_;
+  CDEFCopyRect8To16 test_func_;
+  CDEFCopyRect8To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect8to16Test);
+
+using CDEFCopyRect16To16 = void (*)(uint16_t *dst, int dstride,
+                                    const uint16_t *src, int sstride, int width,
+                                    int height);
+
+using CDEFCopyRect16To16Param =
+    std::tuple<CDEFCopyRect16To16, CDEFCopyRect16To16>;
+
+class CDEFCopyRect16to16Test
+    : public ::testing::TestWithParam<CDEFCopyRect16To16Param> {
+ public:
+  CDEFCopyRect16to16Test()
+      : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+        test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+  ~CDEFCopyRect16to16Test() override = default;
+  void SetUp() override {
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(src_, nullptr);
+    ref_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(ref_dst_, nullptr);
+    test_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(test_dst_, nullptr);
+  }
+
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(ref_dst_);
+    aom_free(test_dst_);
+  }
+
+  void test_copy_rect_16_to_16(CDEFCopyRect16To16 test_func,
+                               CDEFCopyRect16To16 ref_func) {
+    constexpr int stride = MAX_CDEF_BLOCK;
+    int error = 0;
+    for (int k = 0; k < kIterations && !error; k++) {
+      // This function operates on values of width that are either 4 or a
+      // multiple of 8. For height, generate a random even value between 2
+      // and 256.
+      const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8;
+      const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2;
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j++) {
+          src_[i * stride + j] = rnd_.Rand16();
+        }
+      }
+
+      ref_func(ref_dst_, stride, src_, stride, width, height);
+      test_func(test_dst_, stride, src_, stride, width, height);
+
+      int i, j;
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+          if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+            error = 1;
+            break;
+          }
+        }
+        if (error) {
+          break;
+        }
+      }
+      EXPECT_EQ(0, error)
+          << "Error: CDEFCopyRect16to16Test, SIMD and C mismatch." << std::endl
+          << "First error at " << i << "," << j << " ("
+          << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+          << ") " << std::endl
+          << "width: " << width << std::endl
+          << "height: " << height << std::endl
+          << std::endl;
+    }
+  }
+
+ protected:
+  libaom_test::ACMRandom rnd_;
+  uint16_t *src_;
+  uint16_t *ref_dst_;
+  uint16_t *test_dst_;
+  CDEFCopyRect16To16 test_func_;
+  CDEFCopyRect16To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect16to16Test);
+
 TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
   test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
 }
@@ -437,6 +604,14 @@
   test_finddir_dual_speed(finddir, ref_finddir);
 }
 
+TEST_P(CDEFCopyRect8to16Test, TestSIMDNoMismatch) {
+  test_copy_rect_8_to_16(test_func_, ref_func_);
+}
+
+TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
+  test_copy_rect_16_to_16(test_func_, ref_func_);
+}
+
 using std::make_tuple;
 
 #if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
@@ -482,6 +657,16 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
                                                       &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_sse2)));
 #endif
 
 #if HAVE_SSSE3
@@ -515,6 +700,16 @@
 INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
                                                       &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_ssse3)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_ssse3)));
 #endif
 
 #if HAVE_SSE4_1
@@ -549,6 +744,16 @@
     SSE4_1, CDEFFindDirDualTest,
     ::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
                                  &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_sse4_1)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_sse4_1)));
 #endif
 
 #if HAVE_AVX2
@@ -582,6 +787,16 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
                                                       &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_avx2)));
 #endif
 
 #if HAVE_NEON
@@ -615,6 +830,16 @@
 INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
                                                       &cdef_find_dir_dual_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_neon)));
 #endif
 
 // Test speed for all supported architectures
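The new copy-rect suites compare each SIMD implementation against the C reference, which is a plain widening rectangle copy. A sketch of that reference behavior, following the CDEFCopyRect8To16 function type declared above:

#include <cstdint>

// Widening rectangle copy matching the CDEFCopyRect8To16 function type:
// each 8-bit source sample is written unchanged to the 16-bit destination.
static void copy_rect8_8bit_to_16bit_ref(uint16_t *dst, int dstride,
                                         const uint8_t *src, int sstride,
                                         int width, int height) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}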
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 97533da..7fdea04 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -88,7 +88,7 @@
 
 class CFLTest {
  public:
-  virtual ~CFLTest() {}
+  virtual ~CFLTest() = default;
   void init(TX_SIZE tx) {
     tx_size = tx;
     width = tx_size_wide[tx_size];
@@ -106,7 +106,7 @@
 template <typename I>
 class CFLTestWithData : public CFLTest {
  public:
-  virtual ~CFLTestWithData() {}
+  ~CFLTestWithData() override = default;
 
  protected:
   I data[CFL_BUF_SQUARE];
@@ -125,7 +125,7 @@
 template <typename I>
 class CFLTestWithAlignedData : public CFLTest {
  public:
-  ~CFLTestWithAlignedData() {
+  ~CFLTestWithAlignedData() override {
     aom_free(chroma_pels_ref);
     aom_free(sub_luma_pels_ref);
     aom_free(chroma_pels);
@@ -177,12 +177,12 @@
 class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
                       public CFLTestWithData<int16_t> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     CFLTest::init(std::get<0>(this->GetParam()));
     sub_avg = std::get<1>(this->GetParam())(tx_size);
     sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size);
   }
-  virtual ~CFLSubAvgTest() {}
+  ~CFLSubAvgTest() override = default;
 
  protected:
   cfl_subtract_average_fn sub_avg;
@@ -223,7 +223,7 @@
 class CFLSubsampleTest : public ::testing::TestWithParam<S>,
                          public CFLTestWithData<I> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     CFLTest::init(std::get<0>(this->GetParam()));
     fun_420 = std::get<1>(this->GetParam())(this->tx_size);
     fun_422 = std::get<2>(this->GetParam())(this->tx_size);
@@ -284,8 +284,8 @@
     : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
                               uint8_t> {
  public:
-  virtual ~CFLSubsampleLBDTest() {}
-  virtual void SetUp() {
+  ~CFLSubsampleLBDTest() override = default;
+  void SetUp() override {
     CFLSubsampleTest::SetUp();
     fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
     fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
@@ -328,8 +328,8 @@
     : public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
                               uint16_t> {
  public:
-  virtual ~CFLSubsampleHBDTest() {}
-  virtual void SetUp() {
+  ~CFLSubsampleHBDTest() override = default;
+  void SetUp() override {
     CFLSubsampleTest::SetUp();
     fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
     fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size);
@@ -369,13 +369,13 @@
 class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
                        public CFLTestWithAlignedData<uint8_t> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     CFLTest::init(std::get<0>(this->GetParam()));
     CFLTestWithAlignedData::init();
     predict = std::get<1>(this->GetParam())(tx_size);
     predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
   }
-  virtual ~CFLPredictTest() {}
+  ~CFLPredictTest() override = default;
 
  protected:
   cfl_predict_lbd_fn predict;
@@ -418,13 +418,13 @@
 class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
                           public CFLTestWithAlignedData<uint16_t> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     CFLTest::init(std::get<0>(this->GetParam()));
     CFLTestWithAlignedData::init();
     predict = std::get<1>(this->GetParam())(tx_size);
     predict_ref = cfl_get_predict_hbd_fn_c(tx_size);
   }
-  virtual ~CFLPredictHBDTest() {}
+  ~CFLPredictHBDTest() override = default;
 
  protected:
   cfl_predict_hbd_fn predict;
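Among the CfL kernels covered above, the subtract-average functions remove the rounded block mean from each sample; since transform blocks have power-of-two areas, the division is a shift. A simplified, illustrative sketch (buffer layout and stride handling are not the library's):

#include <cstdint>

// Subtract the rounded block average from each sample. width * height is a
// power of two for transform blocks, so the division reduces to a shift.
static void subtract_average_ref(int16_t *buf, int width, int height) {
  const int num = width * height;
  int log2_num = 0;
  while ((1 << log2_num) < num) ++log2_num;
  int32_t sum = 0;
  for (int i = 0; i < num; ++i) sum += buf[i];
  const int16_t avg = static_cast<int16_t>((sum + (num >> 1)) >> log2_num);
  for (int i = 0; i < num; ++i) buf[i] = static_cast<int16_t>(buf[i] - avg);
}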
diff --git a/test/cnn_test.cc b/test/cnn_test.cc
index 77d8d55..127ed3d 100644
--- a/test/cnn_test.cc
+++ b/test/cnn_test.cc
@@ -2520,7 +2520,7 @@
 
 class CNNConvolveTest : public ::testing::TestWithParam<CNNConvolveTestFuncs> {
  protected:
-  virtual void SetUp() { params_ = GetParam(); }
+  void SetUp() override { params_ = GetParam(); }
 
   void RunCNNConvolveSetup(int run_times) {
     int in_width = 65;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index d768d2e..7ffc465 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -32,9 +32,9 @@
 
 class CodecFactory {
  public:
-  CodecFactory() {}
+  CodecFactory() = default;
 
-  virtual ~CodecFactory() {}
+  virtual ~CodecFactory() = default;
 
   virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
 
@@ -95,7 +95,7 @@
       : Decoder(cfg, flag) {}
 
  protected:
-  virtual aom_codec_iface_t *CodecInterface() const {
+  aom_codec_iface_t *CodecInterface() const override {
 #if CONFIG_AV1_DECODER
     return aom_codec_av1_dx();
 #else
@@ -111,7 +111,7 @@
       : Encoder(cfg, init_flags, stats) {}
 
  protected:
-  virtual aom_codec_iface_t *CodecInterface() const {
+  aom_codec_iface_t *CodecInterface() const override {
 #if CONFIG_AV1_ENCODER
     return aom_codec_av1_cx();
 #else
@@ -124,12 +124,12 @@
  public:
   AV1CodecFactory() : CodecFactory() {}
 
-  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }
 
-  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
-                                 const aom_codec_flags_t flags) const {
+  Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                         const aom_codec_flags_t flags) const override {
 #if CONFIG_AV1_DECODER
     return new AV1Decoder(cfg, flags);
 #else
@@ -139,9 +139,9 @@
 #endif
   }
 
-  virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
-                                 const aom_codec_flags_t init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                         const aom_codec_flags_t init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_AV1_ENCODER
     return new AV1Encoder(cfg, init_flags, stats);
 #else
@@ -152,8 +152,8 @@
 #endif
   }
 
-  virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
-                                               unsigned int usage) const {
+  aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                       unsigned int usage) const override {
 #if CONFIG_AV1_ENCODER
     return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
 #else
diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc
index c3e51fd..f7b7eac 100644
--- a/test/coding_path_sync.cc
+++ b/test/coding_path_sync.cc
@@ -120,9 +120,9 @@
   int width_, height_;
 };
 
-// lowers an aom_image_t to a easily comparable/printable form
-std::vector<int16_t> Serialize(const aom_image_t *img) {
-  std::vector<int16_t> bytes;
+// lowers an aom_image_t to an easily comparable/printable form
+std::vector<uint16_t> Serialize(const aom_image_t *img) {
+  std::vector<uint16_t> bytes;
   bytes.reserve(img->d_w * img->d_h * 3);
   for (int plane = 0; plane < 3; ++plane) {
     const int w = aom_img_plane_width(img, plane);
@@ -130,11 +130,13 @@
 
     for (int r = 0; r < h; ++r) {
       for (int c = 0; c < w; ++c) {
-        unsigned char *row = img->planes[plane] + r * img->stride[plane];
-        if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-          bytes.push_back(row[c * 2]);
-        else
+        const unsigned char *row = img->planes[plane] + r * img->stride[plane];
+        if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+          const uint16_t *row16 = reinterpret_cast<const uint16_t *>(row);
+          bytes.push_back(row16[c]);
+        } else {
           bytes.push_back(row[c]);
+        }
       }
     }
   }
@@ -155,7 +157,7 @@
 
   ~Decoder() { aom_codec_destroy(&dec_); }
 
-  std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
+  std::vector<uint16_t> decode(const aom_codec_cx_pkt_t *pkt) {
     aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
                      pkt->data.frame.sz, nullptr);
 
@@ -179,8 +181,8 @@
     for (int k = 0; k < 3; ++k) {
       const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
 
-      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
-      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+      std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
 
       ASSERT_EQ(lbd_yuv, hbd_yuv);
     }
@@ -199,8 +201,8 @@
     for (int k = 0; k < 5; ++k) {
       const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
 
-      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
-      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+      std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
 
       ASSERT_EQ(lbd_yuv, hbd_yuv);
     }
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 4218ac3..2f81d7e 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -13,9 +13,12 @@
 
 using libaom_test::ACMRandom;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgParam;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest);
 #if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
@@ -26,6 +29,19 @@
 using std::make_tuple;
 using std::tuple;
 
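+// Out-of-class definitions for DistWtdCompAvgTest's static buffer pointers.
+// The underlying memory is allocated in SetUpTestSuite() and released in
+// TearDownTestSuite().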
+uint8_t *DistWtdCompAvgTest::reference_data_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_test_ = nullptr;
+uint8_t *DistWtdCompAvgTest::reference_data8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_test_ = nullptr;
+uint16_t *DistWtdCompAvgTest::reference_data16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::second_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_test_ = nullptr;
+
 namespace {
 
 TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
@@ -52,6 +68,141 @@
                              aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif  // HAVE_NEON
+
+TEST_P(DistWtdCompAvgTest, MaxRef) {
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckCompAvg();
+}
+
+TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, mask_);
+  CheckCompAvg();
+}
+
+TEST_P(DistWtdCompAvgTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(DistWtdCompAvgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+  reference_stride_ = tmp_stride;
+}
+
+// TODO(chengchen): add highbd tests
+const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+
+#if HAVE_SSSE3
+const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+const DistWtdCompAvgParam dist_wtd_comp_avg_neon_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_neon_tests));
+#endif  // HAVE_NEON
+
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
@@ -67,6 +218,12 @@
                              aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_highbd_dist_wtd_comp_avg_pred_neon, 1));
+#endif
+
 TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
 }
@@ -80,6 +237,13 @@
                          libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
                              aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
 #endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+                             aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 }  // namespace
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index c1526d8..396df2e 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -40,11 +40,18 @@
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
     int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
 
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, const uint8_t *ref,
+                                   int ref_stride,
+                                   const DIST_WTD_COMP_PARAMS *jcp_param);
+
 typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
 
 typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
     DISTWTDCOMPAVGUPSAMPLEDParam;
 
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
 #if CONFIG_AV1_HIGHBITDEPTH
 typedef void (*highbddistwtdcompavgupsampled_func)(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
@@ -90,8 +97,8 @@
 class AV1DISTWTDCOMPAVGTest
     : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
  public:
-  ~AV1DISTWTDCOMPAVGTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  ~AV1DISTWTDCOMPAVGTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -193,8 +200,8 @@
 class AV1DISTWTDCOMPAVGUPSAMPLEDTest
     : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
  public:
-  ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
   void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
@@ -317,12 +324,198 @@
   libaom_test::ACMRandom rnd_;
 };  // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
 
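+// Tests dist_wtd_comp_avg_pred implementations. The parameters are
+// (width, height, function under test, bit depth); a bit depth of -1 selects
+// the 8-bit path in SetUp().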
+class DistWtdCompAvgTest
+    : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
+      public ::testing::Test {
+ public:
+  DistWtdCompAvgTest()
+      : width_(GET_PARAM(0)), height_(GET_PARAM(1)), bd_(GET_PARAM(3)) {}
+
+  static void SetUpTestSuite() {
+    reference_data8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize));
+    ASSERT_NE(reference_data8_, nullptr);
+    second_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(second_pred8_, nullptr);
+    comp_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(comp_pred8_, nullptr);
+    comp_pred8_test_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(comp_pred8_test_, nullptr);
+    reference_data16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+    ASSERT_NE(reference_data16_, nullptr);
+    second_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(second_pred16_, nullptr);
+    comp_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_, nullptr);
+    comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_test_, nullptr);
+  }
+
+  static void TearDownTestSuite() {
+    aom_free(reference_data8_);
+    reference_data8_ = nullptr;
+    aom_free(second_pred8_);
+    second_pred8_ = nullptr;
+    aom_free(comp_pred8_);
+    comp_pred8_ = nullptr;
+    aom_free(comp_pred8_test_);
+    comp_pred8_test_ = nullptr;
+    aom_free(reference_data16_);
+    reference_data16_ = nullptr;
+    aom_free(second_pred16_);
+    second_pred16_ = nullptr;
+    aom_free(comp_pred16_);
+    comp_pred16_ = nullptr;
+    aom_free(comp_pred16_test_);
+    comp_pred16_test_ = nullptr;
+  }
+
+ protected:
+  // Handle up to 4 128x128 blocks, with stride up to 256
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 128 * 256;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
+
+  void SetUp() override {
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = AOM_BITS_8;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+      comp_pred_ = comp_pred8_;
+      comp_pred_test_ = comp_pred8_test_;
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+      comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+      comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+    }
+    mask_ = (1 << bit_depth_) - 1;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual uint8_t *GetReference(int block_idx) {
+    if (use_high_bit_depth_)
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+    return reference_data_ + block_idx * kDataBlockSize;
+  }
+
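+  // Computes the C reference output: each pixel is the weighted sum
+  // fwd_offset * ref + bck_offset * pred, rounded with
+  // ROUND_POWER_OF_TWO(sum, 4).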
+  void ReferenceDistWtdCompAvg(int block_idx) {
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const second_pred8 = second_pred_;
+    uint8_t *const comp_pred8 = comp_pred_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+    uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          const int tmp =
+              second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+              reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+        } else {
+          const int tmp =
+              second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+              reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+        }
+      }
+    }
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+        } else {
+          data16[h * stride + w] = fill_constant;
+        }
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = rnd_.Rand8();
+        } else {
+          data16[h * stride + w] = rnd_.Rand16() & mask_;
+        }
+      }
+    }
+  }
+
+  void dist_wtd_comp_avg(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+                                          height_, reference, reference_stride_,
+                                          &jcp_param_));
+  }
+
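+  // Sweeps every (fwd_offset, bck_offset) pair from quant_dist_lookup_table,
+  // in both orders, and checks the function under test against the C
+  // reference pixel for pixel.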
+  void CheckCompAvg() {
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < 4; ++i) {
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
+
+        ReferenceDistWtdCompAvg(0);
+        dist_wtd_comp_avg(0);
+
+        for (int y = 0; y < height_; ++y)
+          for (int x = 0; x < width_; ++x)
+            ASSERT_EQ(comp_pred_[y * width_ + x],
+                      comp_pred_test_[y * width_ + x]);
+      }
+    }
+  }
+
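+  // reference_data_, second_pred_, comp_pred_ and comp_pred_test_ alias the
+  // 8-bit or 16-bit backing buffers (via CONVERT_TO_BYTEPTR for high bit
+  // depth), depending on the bit depth chosen in SetUp().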
+  int width_, height_, mask_, bd_;
+  aom_bit_depth_t bit_depth_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
+  bool use_high_bit_depth_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
+  int reference_stride_;
+  static uint8_t *comp_pred_;
+  static uint8_t *comp_pred8_;
+  static uint16_t *comp_pred16_;
+  static uint8_t *comp_pred_test_;
+  static uint8_t *comp_pred8_test_;
+  static uint16_t *comp_pred16_test_;
+  DIST_WTD_COMP_PARAMS jcp_param_;
+
+  ACMRandom rnd_;
+};
+
 #if CONFIG_AV1_HIGHBITDEPTH
 class AV1HighBDDISTWTDCOMPAVGTest
     : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
  public:
-  ~AV1HighBDDISTWTDCOMPAVGTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  ~AV1HighBDDISTWTDCOMPAVGTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -430,8 +623,8 @@
 class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
     : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> {
  public:
-  ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
   void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc
index 06c3192..b65730a 100644
--- a/test/comp_mask_pred_test.cc
+++ b/test/comp_mask_pred_test.cc
@@ -48,10 +48,10 @@
 
 class AV1CompMaskPredBase : public ::testing::Test {
  public:
-  ~AV1CompMaskPredBase();
-  void SetUp();
+  ~AV1CompMaskPredBase() override;
+  void SetUp() override;
 
-  void TearDown();
+  void TearDown() override;
 
  protected:
   bool CheckResult(int width, int height) {
@@ -76,7 +76,7 @@
   uint8_t *ref_;
 };
 
-AV1CompMaskPredBase::~AV1CompMaskPredBase() {}
+AV1CompMaskPredBase::~AV1CompMaskPredBase() = default;
 
 void AV1CompMaskPredBase::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -303,10 +303,10 @@
 
 class AV1CompAvgPredTest : public ::testing::TestWithParam<CompAvgPredParam> {
  public:
-  ~AV1CompAvgPredTest();
-  void SetUp();
+  ~AV1CompAvgPredTest() override;
+  void SetUp() override;
 
-  void TearDown();
+  void TearDown() override;
 
  protected:
   void RunCheckOutput(comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
@@ -333,7 +333,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompAvgPredTest);
 
-AV1CompAvgPredTest::~AV1CompAvgPredTest() {}
+AV1CompAvgPredTest::~AV1CompAvgPredTest() = default;
 
 void AV1CompAvgPredTest::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -390,7 +390,7 @@
     const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
     elapsed_time[i] = 1000.0 * time;
   }
-  printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+  printf("CompAvgPred %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
          elapsed_time[1]);
   printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
 }
@@ -420,10 +420,10 @@
 #if CONFIG_AV1_HIGHBITDEPTH
 class AV1HighbdCompMaskPredTestBase : public ::testing::Test {
  public:
-  ~AV1HighbdCompMaskPredTestBase();
-  void SetUp();
+  ~AV1HighbdCompMaskPredTestBase() override;
+  void SetUp() override;
 
-  void TearDown();
+  void TearDown() override;
 
  protected:
   bool CheckResult(int width, int height) {
@@ -448,7 +448,7 @@
   uint16_t *ref_;
 };
 
-AV1HighbdCompMaskPredTestBase::~AV1HighbdCompMaskPredTestBase() {}
+AV1HighbdCompMaskPredTestBase::~AV1HighbdCompMaskPredTestBase() = default;
 
 void AV1HighbdCompMaskPredTestBase::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -494,14 +494,14 @@
     : public AV1HighbdCompMaskPredTestBase,
       public ::testing::WithParamInterface<HighbdCompMaskPredParam> {
  public:
-  ~AV1HighbdCompMaskPredTest();
+  ~AV1HighbdCompMaskPredTest() override;
 
  protected:
   void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
   void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
 };
 
-AV1HighbdCompMaskPredTest::~AV1HighbdCompMaskPredTest() {}
+AV1HighbdCompMaskPredTest::~AV1HighbdCompMaskPredTest() = default;
 
 void AV1HighbdCompMaskPredTest::RunCheckOutput(
     highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
@@ -583,6 +583,14 @@
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
 }
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1HighbdCompMaskPredTest,
+    ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_neon),
+                       ::testing::ValuesIn(kCompMaskPredParams),
+                       ::testing::Range(8, 13, 2)));
+#endif
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdCompMaskPredTest,
@@ -612,7 +620,7 @@
     : public AV1HighbdCompMaskPredTestBase,
       public ::testing::WithParamInterface<HighbdUpsampledPredParam> {
  public:
-  ~AV1HighbdUpsampledPredTest();
+  ~AV1HighbdUpsampledPredTest() override;
 
  protected:
   void RunCheckOutput(highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize);
@@ -620,7 +628,7 @@
                     int havSub);
 };
 
-AV1HighbdUpsampledPredTest::~AV1HighbdUpsampledPredTest() {}
+AV1HighbdUpsampledPredTest::~AV1HighbdUpsampledPredTest() = default;
 
 void AV1HighbdUpsampledPredTest::RunCheckOutput(
     highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize) {
@@ -712,5 +720,137 @@
                        ::testing::Range(8, 13, 2)));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1HighbdUpsampledPredTest,
+    ::testing::Combine(::testing::Values(&aom_highbd_upsampled_pred_neon),
+                       ::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Range(8, 13, 2)));
+#endif
+
+typedef void (*highbd_comp_avg_pred_func)(uint8_t *comp_pred,
+                                          const uint8_t *pred, int width,
+                                          int height, const uint8_t *ref,
+                                          int ref_stride);
+
+typedef std::tuple<highbd_comp_avg_pred_func, BLOCK_SIZE, int>
+    HighbdCompAvgPredParam;
+
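+// Checks aom_highbd_comp_avg_pred implementations against the C reference
+// for the given block size and bit depth.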
+class AV1HighbdCompAvgPredTest
+    : public ::testing::TestWithParam<HighbdCompAvgPredParam> {
+ public:
+  ~AV1HighbdCompAvgPredTest() override;
+  void SetUp() override;
+
+ protected:
+  void RunCheckOutput(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+  void RunSpeedTest(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) const {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  // Initialized to nullptr so the destructor stays safe even if an allocation
+  // assertion fails in SetUp().
+  uint16_t *comp_pred1_ = nullptr;
+  uint16_t *comp_pred2_ = nullptr;
+  uint16_t *pred_ = nullptr;
+  uint16_t *ref_ = nullptr;
+};
+
+AV1HighbdCompAvgPredTest::~AV1HighbdCompAvgPredTest() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_);
+}
+
+void AV1HighbdCompAvgPredTest::SetUp() {
+  const int bd = GET_PARAM(2);
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+
+  comp_pred1_ =
+      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+  ASSERT_NE(comp_pred1_, nullptr);
+  comp_pred2_ =
+      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+  ASSERT_NE(comp_pred2_, nullptr);
+  pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+  ASSERT_NE(pred_, nullptr);
+  ref_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*ref_));
+  ASSERT_NE(ref_, nullptr);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    ref_[i] = rnd_.Rand16() & ((1 << bd) - 1);
+  }
+}
+
+void AV1HighbdCompAvgPredTest::RunCheckOutput(
+    highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(comp_pred1_),
+                             CONVERT_TO_BYTEPTR(pred_), w, h,
+                             CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+  test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+            CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+
+  ASSERT_EQ(CheckResult(w, h), true);
+}
+
+void AV1HighbdCompAvgPredTest::RunSpeedTest(highbd_comp_avg_pred_func test_impl,
+                                            BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int num_loops = 1000000000 / (w + h);
+
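+  // Time the C reference and the implementation under test over the same
+  // number of iterations and report the speedup ratio.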
+  highbd_comp_avg_pred_func functions[2] = { aom_highbd_comp_avg_pred_c,
+                                             test_impl };
+  double elapsed_time[2] = { 0.0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    highbd_comp_avg_pred_func func = functions[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+           CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time;
+  }
+  printf("HighbdCompAvg %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompAvgPredTest);
+
+TEST_P(AV1HighbdCompAvgPredTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1HighbdCompAvgPredTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1HighbdCompAvgPredTest,
+    ::testing::Combine(::testing::Values(&aom_highbd_comp_avg_pred_neon),
+                       ::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Range(8, 13, 2)));
+#endif
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/test/convolve_round_test.cc b/test/convolve_round_test.cc
deleted file mode 100644
index 0580744..0000000
--- a/test/convolve_round_test.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/aom_timer.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-#define CONVOLVE_ROUNDING_PARAM                                            \
-  const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
-      int h, int bits
-
-typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
-
-typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
-  const int bd = 8;
-  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
-  const int bd = 10;
-  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-template <ConvolveRoundFuncHbd fn>
-void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
-  const int bd = 12;
-  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
-}
-
-typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
-
-using std::tuple;
-
-typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
-    ConvolveRoundParam;
-
-const int kTestNum = 5000;
-
-class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
- protected:
-  ConvolveRoundTest()
-      : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
-  }
-  virtual ~ConvolveRoundTest() {}
-
-  virtual void SetUp() {
-    const size_t block_size = 128 * 128;
-    src_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, block_size * sizeof(*src_)));
-    ASSERT_NE(src_, nullptr);
-    dst_ref_ = reinterpret_cast<uint16_t *>(
-        aom_memalign(16, block_size * sizeof(*dst_ref_)));
-    ASSERT_NE(dst_ref_, nullptr);
-    dst_ = reinterpret_cast<uint16_t *>(
-        aom_memalign(16, block_size * sizeof(*dst_)));
-    ASSERT_NE(dst_, nullptr);
-  }
-
-  virtual void TearDown() {
-    aom_free(src_);
-    aom_free(dst_ref_);
-    aom_free(dst_);
-  }
-
-  void ConvolveRoundingRun() {
-    int test_num = 0;
-    const int src_stride = 128;
-    const int dst_stride = 128;
-    int bits = 13;
-    uint8_t *dst = 0;
-    uint8_t *dst_ref = 0;
-
-    if (data_path_ == LOWBITDEPTH_TEST) {
-      dst = reinterpret_cast<uint8_t *>(dst_);
-      dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
-    } else if (data_path_ == HIGHBITDEPTH_TEST) {
-      dst = CONVERT_TO_BYTEPTR(dst_);
-      dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
-    } else {
-      assert(0);
-    }
-
-    while (test_num < kTestNum) {
-      int block_size = test_num % BLOCK_SIZES_ALL;
-      int w = block_size_wide[block_size];
-      int h = block_size_high[block_size];
-
-      if (test_num % 2 == 0)
-        bits -= 1;
-      else
-        bits += 1;
-
-      GenerateBufferWithRandom(src_, src_stride, bits, w, h);
-
-      func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
-      API_REGISTER_STATE_CHECK(
-          func_(src_, src_stride, dst, dst_stride, w, h, bits));
-
-      if (data_path_ == LOWBITDEPTH_TEST) {
-        for (int r = 0; r < h; ++r) {
-          for (int c = 0; c < w; ++c) {
-            ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
-                << "Mismatch at r: " << r << " c: " << c << " w: " << w
-                << " h: " << h << " test: " << test_num;
-          }
-        }
-      } else {
-        for (int r = 0; r < h; ++r) {
-          for (int c = 0; c < w; ++c) {
-            ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
-                << "Mismatch at r: " << r << " c: " << c << " w: " << w
-                << " h: " << h << " test: " << test_num;
-          }
-        }
-      }
-
-      test_num++;
-    }
-  }
-
-  void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
-                                int h) {
-    int32_t number;
-    for (int r = 0; r < h; ++r) {
-      for (int c = 0; c < w; ++c) {
-        number = static_cast<int32_t>(rand_.Rand31());
-        number %= 1 << (bits + 9);
-        src[r * src_stride + c] = number;
-      }
-    }
-  }
-
-  ACMRandom rand_;
-  int32_t *src_;
-  uint16_t *dst_ref_;
-  uint16_t *dst_;
-
-  ConvolveRoundFunc func_ref_;
-  ConvolveRoundFunc func_;
-  DataPathType data_path_;
-};
-
-TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
-
-using std::make_tuple;
-#if HAVE_AVX2
-const ConvolveRoundParam kConvRndParamArray[] = {
-  make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
-             LOWBITDEPTH_TEST),
-  make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
-             &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
-             HIGHBITDEPTH_TEST),
-  make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
-             &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
-             HIGHBITDEPTH_TEST),
-  make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
-             &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
-             HIGHBITDEPTH_TEST)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveRoundTest,
-                         ::testing::ValuesIn(kConvRndParamArray));
-#endif  // HAVE_AVX2
-}  // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 8aed171..c97f814 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -301,8 +301,6 @@
     ASSERT_NE(output16_ref_, nullptr);
   }
 
-  virtual void TearDown() {}
-
   static void TearDownTestSuite() {
     aom_free(input_ - 1);
     input_ = nullptr;
@@ -345,7 +343,7 @@
             i % kOuterBlockSize >= (BorderLeft() + Width()));
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
     if (UUT_->use_highbd_ != 0)
       mask_ = (1 << UUT_->use_highbd_) - 1;
@@ -764,6 +762,17 @@
 WRAP(convolve8_horiz_avx2, 12)
 WRAP(convolve8_vert_avx2, 12)
 #endif  // HAVE_AVX2
+
+#if HAVE_NEON
+WRAP(convolve8_horiz_neon, 8)
+WRAP(convolve8_vert_neon, 8)
+
+WRAP(convolve8_horiz_neon, 10)
+WRAP(convolve8_vert_neon, 10)
+
+WRAP(convolve8_horiz_neon, 12)
+WRAP(convolve8_vert_neon, 12)
+#endif  // HAVE_NEON
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #undef WRAP
@@ -866,6 +875,21 @@
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_neon(wrap_convolve8_horiz_neon_8,
+                                            wrap_convolve8_vert_neon_8, 8);
+const ConvolveFunctions wrap_convolve10_neon(wrap_convolve8_horiz_neon_10,
+                                             wrap_convolve8_vert_neon_10, 10);
+const ConvolveFunctions wrap_convolve12_neon(wrap_convolve8_horiz_neon_12,
+                                             wrap_convolve8_vert_neon_12, 12);
+const ConvolveParam kArray_HighbdConvolve8_neon[] = {
+  ALL_SIZES_64(wrap_convolve8_neon), ALL_SIZES_64(wrap_convolve10_neon),
+  ALL_SIZES_64(wrap_convolve12_neon)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdConvolveTest,
+                         ::testing::ValuesIn(kArray_HighbdConvolve8_neon));
+#endif
 const ConvolveFunctions convolve8_neon(aom_convolve8_horiz_neon,
                                        aom_convolve8_vert_neon, 0);
 const ConvolveParam kArray_Convolve8_neon[] = { ALL_SIZES(convolve8_neon) };
@@ -874,4 +898,25 @@
                          ::testing::ValuesIn(kArray_Convolve8_neon));
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(aom_convolve8_horiz_neon_dotprod,
+                                               aom_convolve8_vert_neon_dotprod,
+                                               0);
+const ConvolveParam kArray_Convolve8_neon_dotprod[] = { ALL_SIZES(
+    convolve8_neon_dotprod) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, LowbdConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_neon_dotprod));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(aom_convolve8_horiz_neon_i8mm,
+                                            aom_convolve8_vert_neon_i8mm, 0);
+const ConvolveParam kArray_Convolve8_neon_i8mm[] = { ALL_SIZES(
+    convolve8_neon_i8mm) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, LowbdConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_neon_i8mm));
+#endif  // HAVE_NEON_I8MM
+
 }  // namespace
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index 93ca8ec..9733732 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -37,10 +37,8 @@
 
 class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
  public:
-  virtual ~AV1CornerMatchTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
+  ~AV1CornerMatchTest() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(int run_times);
@@ -50,12 +48,11 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CornerMatchTest);
 
-AV1CornerMatchTest::~AV1CornerMatchTest() {}
+AV1CornerMatchTest::~AV1CornerMatchTest() = default;
 void AV1CornerMatchTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
   target_func = GET_PARAM(1);
 }
-void AV1CornerMatchTest::TearDown() {}
 
 void AV1CornerMatchTest::RunCheckOutput(int run_times) {
   const int w = 128, h = 128;
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 5396bec..b5f5d29 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -28,19 +28,19 @@
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(AOM_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 25;
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
@@ -52,7 +52,7 @@
     }
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
 
diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc
index cfffcd7a..53db8b0 100644
--- a/test/cpu_used_firstpass_test.cc
+++ b/test/cpu_used_firstpass_test.cc
@@ -27,9 +27,9 @@
  protected:
   CpuUsedFirstpassTest()
       : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(2)) {}
-  virtual ~CpuUsedFirstpassTest() {}
+  ~CpuUsedFirstpassTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -40,7 +40,7 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual void BeginPassHook(unsigned int pass) {
+  void BeginPassHook(unsigned int pass) override {
     psnr_ = 0.0;
     nframes_ = 0;
 
@@ -50,13 +50,13 @@
       cpu_used_ = second_pass_cpu_used_;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 21b40d9..a75a72f 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -36,9 +36,9 @@
   }
 
  protected:
-  virtual ~DatarateTestLarge() {}
+  ~DatarateTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
@@ -298,6 +298,72 @@
         << " The datarate for the file missed the target!"
         << cfg_.rc_target_bitrate << " " << effective_datarate_;
   }
+
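+  // Encodes with fixed superres scaling (denominator 16) under CBR and checks
+  // that the effective datarate stays within 15% of the target.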
+  virtual void BasicRateTargetingSuperresCBR() {
+    ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240,
+                                         30, 1, 0, 800);
+
+    cfg_.g_profile = 0;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+
+    cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+    cfg_.rc_superres_denominator = 16;
+    cfg_.rc_superres_kf_denominator = 16;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " " << effective_datarate_;
+  }
+
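+  // Same as BasicRateTargetingSuperresCBR(), but with two encoder threads and
+  // tile columns enabled.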
+  virtual void BasicRateTargetingSuperresCBRMultiThreads() {
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+
+    cfg_.g_profile = 0;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_threads = 2;
+
+    cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+    cfg_.rc_superres_denominator = 16;
+    cfg_.rc_superres_kf_denominator = 16;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    tile_column_ = 1;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " " << effective_datarate_;
+  }
 };
 
 // Params: test mode, speed, aq mode.
@@ -312,9 +378,9 @@
   }
 
  protected:
-  virtual ~DatarateTestFrameDropLarge() {}
+  ~DatarateTestFrameDropLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
@@ -405,6 +471,16 @@
   BasicRateTargeting444CBRScreenTest();
 }
 
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBR) {
+  BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multiple threads.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBRMultiThreads) {
+  BasicRateTargetingSuperresCBRMultiThreads();
+}
+
 // Check that (1) the first dropped frame gets earlier and earlier
 // as the drop frame threshold is increased, and (2) that the total number of
 // frame drops does not decrease as we increase frame drop threshold.
@@ -433,9 +509,9 @@
   }
 
  protected:
-  virtual ~DatarateTestSpeedChangeRealtime() {}
+  ~DatarateTestSpeedChangeRealtime() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
@@ -521,6 +597,16 @@
   BasicRateTargeting444CBRScreenTest();
 }
 
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBR) {
+  BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multiple threads.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBRMultiThreads) {
+  BasicRateTargetingSuperresCBRMultiThreads();
+}
+
 // Check that (1) the first dropped frame gets earlier and earlier
 // as the drop frame threshold is increased, and (2) that the total number of
 // frame drops does not decrease as we increase frame drop threshold.
@@ -540,15 +626,15 @@
   DatarateTestSetFrameQpRealtime() : DatarateTest(GetParam()), frame_(0) {}
 
  protected:
-  virtual ~DatarateTestSetFrameQpRealtime() {}
+  ~DatarateTestSetFrameQpRealtime() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(libaom_test::kRealTime);
     ResetModel();
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     set_cpu_used_ = 7;
     DatarateTest::PreEncodeFrameHook(video, encoder);
     frame_qp_ = rnd_.PseudoUniform(63);
@@ -556,7 +642,7 @@
     frame_++;
   }
 
-  virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
     if (frame_ >= total_frames_) return;
     int qp = 0;
     encoder->Control(AOME_GET_LAST_QUANTIZER_64, &qp);
diff --git a/test/datarate_test.h b/test/datarate_test.h
index 4b74c65..accc1ad 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -28,7 +28,7 @@
         speed_change_test_(false) {}
 
  protected:
-  virtual ~DatarateTest() {}
+  ~DatarateTest() override = default;
 
   virtual void ResetModel() {
     last_pts_ = 0;
@@ -57,8 +57,8 @@
     }
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
@@ -122,7 +122,7 @@
     duration_ = 0;
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
 
@@ -176,7 +176,7 @@
     }
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     duration_ = (last_pts_ + 1) * timebase_;
     // Effective file datarate:
     effective_datarate_ = (bits_total_ / 1000.0) / duration_;
diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc
index 5a13f75..4e06f1a 100644
--- a/test/decode_multithreaded_test.cc
+++ b/test/decode_multithreaded_test.cc
@@ -63,16 +63,16 @@
     }
   }
 
-  virtual ~AV1DecodeMultiThreadedTest() {
+  ~AV1DecodeMultiThreadedTest() override {
     delete single_thread_dec_;
     for (int i = 0; i < kNumMultiThreadDecoders; ++i)
       delete multi_thread_dec_[i];
   }
 
-  virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
+  void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
       encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
@@ -93,7 +93,7 @@
     md5->Add(img);
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_);
 
     for (int i = 0; i < kNumMultiThreadDecoders; ++i)
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 900cb67..0300354 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -101,13 +101,12 @@
  protected:
   AV1NewEncodeDecodePerfTest()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
-        outfile_(0), out_frames_(0) {}
+        outfile_(nullptr), out_frames_(0) {}
 
-  virtual ~AV1NewEncodeDecodePerfTest() {}
+  ~AV1NewEncodeDecodePerfTest() override = default;
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
 
     cfg_.g_lag_in_frames = 25;
     cfg_.rc_min_quantizer = 2;
@@ -121,8 +120,8 @@
     cfg_.rc_end_usage = AOM_VBR;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, speed_);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
@@ -130,7 +129,7 @@
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     const char *const env = getenv("LIBAOM_TEST_DATA_PATH");
     const std::string data_path(env ? env : ".");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
@@ -138,7 +137,7 @@
     ASSERT_NE(outfile_, nullptr);
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     if (outfile_ != nullptr) {
       if (!fseek(outfile_, 0, SEEK_SET))
         ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
@@ -147,7 +146,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -160,7 +159,7 @@
               pkt->data.frame.sz);
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   void set_speed(unsigned int speed) { speed_ = speed; }
 
diff --git a/test/decode_scalability_test.cc b/test/decode_scalability_test.cc
index c04d58b..d66c8ec 100644
--- a/test/decode_scalability_test.cc
+++ b/test/decode_scalability_test.cc
@@ -43,7 +43,7 @@
       : DecoderTest(GET_PARAM(0)), headers_(GET_PARAM(1).headers),
         num_headers_(GET_PARAM(1).num_headers) {}
 
-  ~DecodeScalabilityTest() override {}
+  ~DecodeScalabilityTest() override = default;
 
   void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
                           libaom_test::Decoder *decoder) override {
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 9678f72..311898e 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -153,7 +153,7 @@
   explicit DecoderTest(const CodecFactory *codec)
       : codec_(codec), cfg_(), flags_(0) {}
 
-  virtual ~DecoderTest() {}
+  virtual ~DecoderTest() = default;
 
   const CodecFactory *codec_;
   aom_codec_dec_cfg_t cfg_;
diff --git a/test/disflow_test.cc b/test/disflow_test.cc
new file mode 100644
index 0000000..124c9a9
--- /dev/null
+++ b/test/disflow_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using ComputeFlowAtPointFunc = void (*)(const uint8_t *src, const uint8_t *ref,
+                                        int x, int y, int width, int height,
+                                        int stride, double *u, double *v);
+
+class ComputeFlowTest
+    : public ::testing::TestWithParam<ComputeFlowAtPointFunc> {
+ public:
+  ComputeFlowTest()
+      : target_func_(GetParam()),
+        rnd_(libaom_test::ACMRandom::DeterministicSeed()) {}
+
+ protected:
+  void RunCheckOutput(int run_times);
+  ComputeFlowAtPointFunc target_func_;
+
+  libaom_test::ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ComputeFlowTest);
+
+void ComputeFlowTest::RunCheckOutput(int run_times) {
+  constexpr int kWidth = 352;
+  constexpr int kHeight = 288;
+
+  ::libaom_test::YUVVideoSource video("bus_352x288_420_f20_b8.yuv",
+                                      AOM_IMG_FMT_I420, kWidth, kHeight, 30, 1,
+                                      0, 2);
+  // Use Y (Luminance) plane.
+  video.Begin();
+  uint8_t *src = video.img()->planes[0];
+  ASSERT_NE(src, nullptr);
+  video.Next();
+  uint8_t *ref = video.img()->planes[0];
+  ASSERT_NE(ref, nullptr);
+
+  // Pick a random value between -5 and 5. The range was chosen arbitrarily as
+  // u and v can take any value in practice, but it shouldn't change the
+  // outcome of the tests.
+  const double u_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+  double u_ref = u_rand;
+  double u_test = u_rand;
+
+  const double v_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+  double v_ref = v_rand;
+  double v_test = v_rand;
+
+  // Pick a random point in the frame. For a 352x288 frame, the function can
+  // be called on any x between 8 and 344 and any y between 8 and 280
+  // (inclusive).
+  const int x = rnd_((kWidth - 8) - 8 + 1) + 8;
+  const int y = rnd_((kHeight - 8) - 8 + 1) + 8;
+
+  aom_usec_timer ref_timer, test_timer;
+
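+  // Run the C reference and the implementation under test once for the output
+  // check; the timing loops below only run in the DISABLED_Speed test
+  // (run_times > 1).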
+  aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth, &u_ref,
+                              &v_ref);
+
+  target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+
+  if (run_times > 1) {
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < run_times; ++i) {
+      aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth,
+                                  &u_ref, &v_ref);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const double elapsed_time_c =
+        static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const double elapsed_time_simd =
+        static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+    printf("c_time=%fns \t simd_time=%fns \t speedup=%.2f\n", elapsed_time_c,
+           elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
+  } else {
+    ASSERT_EQ(u_ref, u_test);
+    ASSERT_EQ(v_ref, v_test);
+  }
+}
+
+TEST_P(ComputeFlowTest, CheckOutput) { RunCheckOutput(1); }
+
+TEST_P(ComputeFlowTest, DISABLED_Speed) { RunCheckOutput(10000000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ComputeFlowTest,
+                         ::testing::Values(aom_compute_flow_at_point_sse4_1));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest,
+                         ::testing::Values(aom_compute_flow_at_point_neon));
+#endif
+
+}  // namespace
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index cbb3153..3135d2a 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -178,7 +178,7 @@
     }
   }
 
-  virtual ~DrPredTest() {}
+  ~DrPredTest() override = default;
 
   void Predict(bool speedtest, int tx) {
     const int kNumTests = speedtest ? kMaxNumTests : 1;
diff --git a/test/dropframe_encode_test.cc b/test/dropframe_encode_test.cc
index c7a801b..4a54c0b 100644
--- a/test/dropframe_encode_test.cc
+++ b/test/dropframe_encode_test.cc
@@ -25,10 +25,10 @@
   DropFrameEncodeTestLarge()
       : EncoderTest(GET_PARAM(0)), frame_number_(0), threads_(GET_PARAM(2)) {}
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     frame_number_ = video->frame();
     if (frame_number_ == 0) {
       encoder->Control(AOME_SET_CPUUSED, 1);
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 2a8c072..e6ef4c2 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -24,9 +24,9 @@
 namespace {
 
 #if CONFIG_REALTIME_ONLY
-const int kUsage = AOM_USAGE_REALTIME;
+const unsigned int kUsage = AOM_USAGE_REALTIME;
 #else
-const int kUsage = AOM_USAGE_GOOD_QUALITY;
+const unsigned int kUsage = AOM_USAGE_GOOD_QUALITY;
 #endif
 
 static void *Memset16(void *dest, int val, size_t length) {
@@ -66,6 +66,22 @@
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
             aom_codec_enc_config_default(iface, &cfg, 3));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = 1 << 16;
+  cfg.g_h = (1 << 14) + 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = (1 << 14) + 1;
+  cfg.g_h = 1 << 16;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_forced_max_frame_width = 1 << 16;
+  cfg.g_forced_max_frame_height = (1 << 14) + 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_forced_max_frame_width = (1 << 14) + 1;
+  cfg.g_forced_max_frame_height = 1 << 16;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
   EXPECT_EQ(nullptr, aom_codec_get_global_headers(nullptr));
 
@@ -90,13 +106,12 @@
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
-TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+void EncodeSetSFrameOnFirstFrame(aom_img_fmt fmt, aom_codec_flags_t flag) {
   constexpr int kWidth = 2;
   constexpr int kHeight = 128;
   unsigned char kBuffer[kWidth * kHeight * 3] = { 0 };
   aom_image_t img;
-  ASSERT_EQ(aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1, kBuffer),
-            &img);
+  ASSERT_EQ(aom_img_wrap(&img, fmt, kWidth, kHeight, 1, kBuffer), &img);
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_enc_cfg_t cfg;
@@ -105,15 +120,25 @@
   cfg.g_h = kHeight;
 
   aom_codec_ctx_t enc;
-  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, flag), AOM_CODEC_OK);
   // One of these aom_codec_encode() calls should fail.
   if (aom_codec_encode(&enc, &img, 0, 1, AOM_EFLAG_SET_S_FRAME) ==
       AOM_CODEC_OK) {
-    EXPECT_NE(aom_codec_encode(&enc, NULL, 0, 0, 0), AOM_CODEC_OK);
+    EXPECT_NE(aom_codec_encode(&enc, nullptr, 0, 0, 0), AOM_CODEC_OK);
   }
   EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
 }
 
+TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+  EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(EncodeAPI, SetSFrameOnFirstFrameHighbd) {
+  EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 TEST(EncodeAPI, MonochromeInProfiles) {
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_enc_cfg_t cfg;
@@ -147,7 +172,7 @@
   ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
 
   aom_image_t *image =
-      aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
   ASSERT_NE(image, nullptr);
 
   // Set the image to two colors so that av1_set_screen_content_options() will
@@ -184,7 +209,7 @@
   ASSERT_EQ(init_status, AOM_CODEC_OK);
 
   aom_image_t *image =
-      aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
   ASSERT_NE(image, nullptr);
 
   // Set the image to two colors so that av1_set_screen_content_options() will
@@ -222,7 +247,7 @@
   ASSERT_EQ(init_status, AOM_CODEC_OK);
 
   aom_image_t *image =
-      aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
   ASSERT_NE(image, nullptr);
 
   // Set the image to two colors so that av1_set_screen_content_options() will
@@ -255,7 +280,7 @@
   ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
 
   aom_image_t *image =
-      aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
   ASSERT_NE(image, nullptr);
 
   // Set the image to two colors so that av1_set_screen_content_options() will
@@ -279,13 +304,13 @@
 }
 
 class EncodeAPIParameterized
-    : public testing::TestWithParam<
-          std::tuple</*usage=*/int, /*speed=*/int, /*aq_mode=*/int>> {};
+    : public testing::TestWithParam<std::tuple<
+          /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {};
 
 // Encodes two frames at a given usage, speed, and aq_mode setting.
 // Reproduces b/303023614
 TEST_P(EncodeAPIParameterized, HighBDEncoderHighBDFrames) {
-  const int usage = std::get<0>(GetParam());
+  const unsigned int usage = std::get<0>(GetParam());
   int speed = std::get<1>(GetParam());
 
   if (speed == 10 && usage != AOM_USAGE_REALTIME) {
@@ -304,15 +329,15 @@
 #if !CONFIG_AV1_HIGHBITDEPTH
   ASSERT_EQ(init_status, AOM_CODEC_INCAPABLE);
 #else
-  const int aq_mode = std::get<2>(GetParam());
-
   ASSERT_EQ(init_status, AOM_CODEC_OK);
 
+  const unsigned int aq_mode = std::get<2>(GetParam());
+
   ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, speed), AOM_CODEC_OK);
   ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_AQ_MODE, aq_mode), AOM_CODEC_OK);
 
   aom_image_t *image =
-      aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
   ASSERT_NE(image, nullptr);
 
   for (unsigned int i = 0; i < image->d_h; ++i) {
@@ -338,7 +363,7 @@
 #endif
 }
 
-const int kUsages[] = {
+const unsigned int kUsages[] = {
   AOM_USAGE_REALTIME,
 #if !CONFIG_REALTIME_ONLY
   AOM_USAGE_GOOD_QUALITY,
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index b626acd..b52cf33 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -63,11 +63,10 @@
       : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
 
-  virtual ~AV1EncodePerfTest() {}
+  ~AV1EncodePerfTest() override = default;
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
 
     cfg_.g_lag_in_frames = 0;
     cfg_.rc_min_quantizer = 2;
@@ -83,8 +82,8 @@
     cfg_.g_threads = threads_;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       const int log2_tile_columns = 3;
       encoder->Control(AOME_SET_CPUUSED, speed_);
@@ -94,19 +93,19 @@
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     min_psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) {
       min_psnr_ = pkt->data.psnr.psnr[0];
     }
   }
 
   // for performance reasons don't decode
-  virtual bool DoDecode() { return 0; }
+  bool DoDecode() const override { return false; }
 
   double min_psnr() const { return min_psnr_; }
 
diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc
index 3d00327..22f6939 100644
--- a/test/encode_small_width_height_test.cc
+++ b/test/encode_small_width_height_test.cc
@@ -25,21 +25,21 @@
 namespace {
 
 // Dummy buffer of zero samples.
-constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 };
+constexpr unsigned char kBuffer[2 * (256 * 512 + 2 * 128 * 256)] = { 0 };
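+// (Twice the size of the largest 8-bit test image, so the same buffer can also
+// back the 16-bit samples used by the high bit depth tests.)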
 #if CONFIG_REALTIME_ONLY
 const int kUsage = 1;
 #else
 const int kUsage = 0;
 #endif
 
-TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+void EncodeSmallWidthMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
   // The image has only one tile and the tile is two AV1 superblocks wide.
   // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
   constexpr int kWidth = 128;
   constexpr int kHeight = 512;
 
   aom_image_t img;
-  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+  EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
                                const_cast<unsigned char *>(kBuffer)));
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -49,22 +49,33 @@
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+  EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+  EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 #if !CONFIG_REALTIME_ONLY
-TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+void EncodeSmallWidthMultiThreadedSpeed0(aom_img_fmt fmt,
+                                         aom_codec_flags_t flag) {
   // The image has only one tile and the tile is two AV1 superblocks wide.
   // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
   constexpr int kWidth = 256;
   constexpr int kHeight = 512;
 
   aom_image_t img;
-  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+  EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
                                const_cast<unsigned char *>(kBuffer)));
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -74,22 +85,34 @@
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+  EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+  EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+                                      AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 #endif
 
-TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+void EncodeSmallHeightMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
   // The image has only one tile and the tile is one AV1 superblock tall.
   // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
   constexpr int kWidth = 512;
   constexpr int kHeight = 64;
 
   aom_image_t img;
-  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+  EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
                                const_cast<unsigned char *>(kBuffer)));
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -99,22 +122,34 @@
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+  EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+  EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I42016,
+                                 AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 #if !CONFIG_REALTIME_ONLY
-TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+void EncodeSmallHeightMultiThreadedSpeed0(aom_img_fmt fmt,
+                                          aom_codec_flags_t flag) {
   // The image has only one tile and the tile is one AV1 superblock tall.
   // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
   constexpr int kWidth = 512;
   constexpr int kHeight = 128;
 
   aom_image_t img;
-  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+  EXPECT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
                                const_cast<unsigned char *>(kBuffer)));
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
@@ -124,17 +159,28 @@
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+  EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+  EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+                                       AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif
 
 // A reproducer test for aomedia:3113. The test should complete without any
 // memory errors.
-TEST(EncodeSmallWidthHeight, 1x1) {
+void Encode1x1(aom_img_fmt fmt, int bitdepth, aom_codec_flags_t flags) {
   constexpr int kWidth = 1;
   constexpr int kHeight = 1;
 
@@ -144,8 +190,8 @@
   // set up img manually.
   aom_image_t img;
   memset(&img, 0, sizeof(img));
-  img.fmt = AOM_IMG_FMT_I420;
-  img.bit_depth = 8;
+  img.fmt = fmt;
+  img.bit_depth = bitdepth;
   img.w = kWidth;
   img.h = kHeight;
   img.d_w = kWidth;
@@ -153,10 +199,14 @@
   img.x_chroma_shift = 1;
   img.y_chroma_shift = 1;
   img.bps = 12;
-  int y_stride = kWidth;
-  int uv_stride = (kWidth + 1) >> 1;
+  const int y_stride = kWidth;
+  const int uv_stride = (kWidth + 1) >> 1;
   int y_height = kHeight;
   int uv_height = (kHeight + 1) >> 1;
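+  // For bit depths above 8, each sample occupies two bytes, so double the
+  // plane heights to double the buffer allocations below.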
+  if (bitdepth > 8) {
+    y_height <<= 1;
+    uv_height <<= 1;
+  }
   img.stride[AOM_PLANE_Y] = y_stride;
   img.stride[AOM_PLANE_U] = img.stride[AOM_PLANE_V] = uv_stride;
   std::unique_ptr<unsigned char[]> y_plane(
@@ -178,11 +228,19 @@
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flags));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+TEST(EncodeSmallWidthHeight, 1x1) { Encode1x1(AOM_IMG_FMT_I420, 8, 0); }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, 1x1) {
+  Encode1x1(AOM_IMG_FMT_I42016, 12, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 }  // namespace
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 80be8ed..d1e6615 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -199,7 +199,7 @@
     cfg_.g_threads = 1;
   }
 
-  virtual ~EncoderTest() {}
+  virtual ~EncoderTest() = default;
 
   // Initialize the cfg_ member with the default configuration for the
   // TestMode enum and maps the TestMode enum to the passes_ variable.
diff --git a/test/encodemb_test.cc b/test/encodemb_test.cc
index 4c725c7..6165fc3 100644
--- a/test/encodemb_test.cc
+++ b/test/encodemb_test.cc
@@ -82,7 +82,7 @@
   while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob;
   EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]);
 
-  // Check qqcoeff is still valid.
+  // Check dqcoeff is still valid.
   for (int i = 0; i < max_eob; ++i) {
     EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]);
   }
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 0a58737..49b0fba 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -42,9 +42,9 @@
  public:
   EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {}
 
-  virtual ~EncodeTxbTest() {}
+  ~EncodeTxbTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
         aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
     ASSERT_NE(coeff_contexts_ref_, nullptr);
@@ -53,7 +53,7 @@
     ASSERT_NE(coeff_contexts_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(coeff_contexts_ref_);
     aom_free(coeff_contexts_);
   }
@@ -211,8 +211,7 @@
 class EncodeTxbInitLevelTest
     : public ::testing::TestWithParam<TxbInitLevelParam> {
  public:
-  virtual ~EncodeTxbInitLevelTest() {}
-  virtual void TearDown() {}
+  ~EncodeTxbInitLevelTest() override = default;
   void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
diff --git a/test/end_to_end_psnr_test.cc b/test/end_to_end_psnr_test.cc
index 0396438..687308d 100644
--- a/test/end_to_end_psnr_test.cc
+++ b/test/end_to_end_psnr_test.cc
@@ -86,9 +86,9 @@
         cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
         encoding_mode_(GET_PARAM(1)) {}
 
-  virtual ~EndToEndTest() {}
+  ~EndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     if (encoding_mode_ == ::libaom_test::kOnePassGood ||
         encoding_mode_ == ::libaom_test::kTwoPassGood) {
@@ -100,18 +100,18 @@
     }
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
diff --git a/test/end_to_end_qmpsnr_test.cc b/test/end_to_end_qmpsnr_test.cc
index de183ad..7a755a7 100644
--- a/test/end_to_end_qmpsnr_test.cc
+++ b/test/end_to_end_qmpsnr_test.cc
@@ -69,7 +69,7 @@
         test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
         ssim_(0.0) {}
 
-  ~EndToEndQMPSNRTest() override {}
+  ~EndToEndQMPSNRTest() override = default;
 
   void SetUp() override { InitializeConfig(encoding_mode_); }
 
diff --git a/test/end_to_end_ssim_test.cc b/test/end_to_end_ssim_test.cc
index 2e40c94..f1b0cae 100644
--- a/test/end_to_end_ssim_test.cc
+++ b/test/end_to_end_ssim_test.cc
@@ -66,7 +66,7 @@
         test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
         ssim_(0.0) {}
 
-  ~EndToEndSSIMTest() override {}
+  ~EndToEndSSIMTest() override = default;
 
   void SetUp() override { InitializeConfig(encoding_mode_); }
 
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index aadbb44..176efdf 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -65,15 +65,13 @@
 
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
-  virtual ~ErrorBlockTest() {}
-  virtual void SetUp() {
+  ~ErrorBlockTest() override = default;
+  void SetUp() override {
     error_block_op_ = GET_PARAM(0);
     ref_error_block_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() {}
-
  protected:
   aom_bit_depth_t bit_depth_;
   ErrorBlockFunc error_block_op_;
@@ -289,6 +287,14 @@
 
 #if (HAVE_NEON)
 const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
              &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
   make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 84330d6..d41884d 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -37,7 +37,7 @@
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLarge() {}
+  ~ErrorResilienceTestLarge() override = default;
 
   void Reset() {
     error_nframes_ = 0;
@@ -58,9 +58,9 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual void SetUp() { InitializeConfig(encoding_mode_); }
+  void SetUp() override { InitializeConfig(encoding_mode_); }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = 0.0;
     nframes_ = 0;
     decoded_nframes_ = 0;
@@ -68,13 +68,13 @@
     mismatch_nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
@@ -146,7 +146,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     // Check that the encode frame flags are correctly reflected
     // in the output frame flags.
     const int encode_flags = pkt->data.frame.flags >> 16;
@@ -176,21 +176,21 @@
     return 0.0;
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     if (error_nframes_ > 0 &&
         (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < error_nframes_; ++i) {
         if (error_frames_[i] == nframes_ - 1) {
           std::cout << "             Skipping decoding frame: "
                     << error_frames_[i] << "\n";
-          return 0;
+          return false;
         }
       }
     }
-    return 1;
+    return true;
   }
 
-  virtual bool DoDecodeInvisible() const {
+  bool DoDecodeInvisible() const override {
     if (invisible_error_nframes_ > 0 &&
         (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < invisible_error_nframes_; ++i) {
@@ -198,14 +198,14 @@
           std::cout << "             Skipping decoding all invisible frames in "
                        "frame pkt: "
                     << invisible_error_frames_[i] << "\n";
-          return 0;
+          return false;
         }
       }
     }
-    return 1;
+    return true;
   }
 
-  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
     if (allow_mismatch_) {
       double mismatch_psnr = compute_psnr(img1, img2);
       mismatch_psnr_ += mismatch_psnr;
@@ -216,8 +216,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     (void)img;
     (void)pts;
     ++decoded_nframes_;
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 6b7fcce..ce45394 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -40,9 +40,9 @@
     firstpass_stats_.buf = nullptr;
     firstpass_stats_.sz = 0;
   }
-  virtual ~AVxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+  ~AVxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
 
     cfg_.g_lag_in_frames = 35;
@@ -53,18 +53,18 @@
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
     abort_ = false;
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     // For first pass stats test, only run first pass encoder.
     if (cfg_.g_pass == AOM_RC_FIRST_PASS) abort_ = true;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                          ::libaom_test::Encoder *encoder) override {
     if (!encoder_initialized_) {
       // Encode in 2-pass mode.
       SetTileSize(encoder);
@@ -84,7 +84,7 @@
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 
-  virtual void StatsPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void StatsPktHook(const aom_codec_cx_pkt_t *pkt) override {
     const uint8_t *const pkt_buf =
         reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
     const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -227,9 +227,9 @@
     md5_dec_.clear();
     md5_enc_.clear();
   }
-  virtual ~AVxEncoderThreadTest() { delete decoder_; }
+  ~AVxEncoderThreadTest() override { delete decoder_; }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
 
     if (encoding_mode_ == ::libaom_test::kOnePassGood ||
@@ -244,12 +244,12 @@
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                          ::libaom_test::Encoder *encoder) override {
     if (!encoder_initialized_) {
       SetTileSize(encoder);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -290,7 +290,7 @@
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     size_enc_.push_back(pkt->data.frame.sz);
 
     ::libaom_test::MD5 md5_enc;
@@ -531,15 +531,15 @@
 #endif  // !CONFIG_REALTIME_ONLY
 
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
-  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+  void SetTileSize(libaom_test::Encoder *encoder) override {
     encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 
-  virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
-                                const std::vector<size_t> ref_size_enc,
-                                const std::vector<std::string> ref_md5_enc,
-                                const std::vector<std::string> ref_md5_dec) {
+  void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+                        const std::vector<size_t> ref_size_enc,
+                        const std::vector<std::string> ref_md5_enc,
+                        const std::vector<std::string> ref_md5_dec) override {
     (void)video;
     (void)ref_size_enc;
     (void)ref_md5_enc;
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index ea7ed50..8f16c4e 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -214,13 +214,12 @@
       : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
         md5_file_(nullptr), num_buffers_(0) {}
 
-  virtual ~ExternalFrameBufferMD5Test() {
+  ~ExternalFrameBufferMD5Test() override {
     if (md5_file_ != nullptr) fclose(md5_file_);
   }
 
-  virtual void PreDecodeFrameHook(
-      const libaom_test::CompressedVideoSource &video,
-      libaom_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
     if (num_buffers_ > 0 && video.frame_number() == 0) {
       // Have libaom use frame buffers we create.
       ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
@@ -236,8 +235,8 @@
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
@@ -315,7 +314,7 @@
   ExternalFrameBufferTest()
       : video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
@@ -327,7 +326,7 @@
     ASSERT_NE(decoder_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete decoder_;
     decoder_ = nullptr;
     delete video_;
@@ -383,7 +382,7 @@
 
 class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 046d810..9cbf208 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -59,10 +59,10 @@
 class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
                      public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
  public:
-  virtual ~Trans4x4FDCT() {}
+  ~Trans4x4FDCT() override = default;
 
   using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = std::get<0>(this->GetParam());
     TxfmBaseOutType::pitch_ = 4;
     TxfmBaseOutType::height_ = 4;
@@ -71,14 +71,13 @@
     TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
     TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
   }
-  virtual void TearDown() {}
 
  protected:
-  void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
 
-  void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) override {
     (void)out;
     (void)dst;
     (void)stride;
diff --git a/test/fft_test.cc b/test/fft_test.cc
index 5443c99..06a17a3 100644
--- a/test/fft_test.cc
+++ b/test/fft_test.cc
@@ -88,7 +88,7 @@
 
 class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
  protected:
-  void SetUp() {
+  void SetUp() override {
     int n = GetParam().n;
     input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
     temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
@@ -100,7 +100,7 @@
     memset(temp_, 0, sizeof(*temp_) * n * n);
     memset(output_, 0, sizeof(*output_) * n * n * 2);
   }
-  void TearDown() {
+  void TearDown() override {
     aom_free(input_);
     aom_free(temp_);
     aom_free(output_);
@@ -178,7 +178,7 @@
 
 class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
  protected:
-  void SetUp() {
+  void SetUp() override {
     int n = GetParam().n;
     input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
     temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
@@ -190,7 +190,7 @@
     memset(temp_, 0, sizeof(*temp_) * n * n * 2);
     memset(output_, 0, sizeof(*output_) * n * n);
   }
-  void TearDown() {
+  void TearDown() override {
     aom_free(input_);
     aom_free(temp_);
     aom_free(output_);
diff --git a/test/film_grain_table_test.cc b/test/film_grain_table_test.cc
index f8937f1..808d966 100644
--- a/test/film_grain_table_test.cc
+++ b/test/film_grain_table_test.cc
@@ -91,7 +91,7 @@
 
   // Extend the existing segment
   aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0);
-  EXPECT_EQ(0, table.head->next);
+  EXPECT_EQ(nullptr, table.head->next);
 
   // Lookup and remove and check that the entry is no longer there
   EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
@@ -100,8 +100,8 @@
   EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain));
   EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
 
-  EXPECT_EQ(0, table.head);
-  EXPECT_EQ(0, table.tail);
+  EXPECT_EQ(nullptr, table.head);
+  EXPECT_EQ(nullptr, table.tail);
   aom_film_grain_table_free(&table);
 }
 
@@ -114,8 +114,8 @@
   aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
   EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain));
 
-  EXPECT_EQ(0, table.head);
-  EXPECT_EQ(0, table.tail);
+  EXPECT_EQ(nullptr, table.head);
+  EXPECT_EQ(nullptr, table.tail);
   aom_film_grain_table_free(&table);
 }
 
@@ -180,7 +180,7 @@
 
 class FilmGrainTableIOTest : public ::testing::Test {
  protected:
-  void SetUp() { memset(&error_, 0, sizeof(error_)); }
+  void SetUp() override { memset(&error_, 0, sizeof(error_)); }
   struct aom_internal_error_info error_;
 };
 
@@ -280,32 +280,35 @@
 };
 
 class FilmGrainEncodeTest
-    : public ::libaom_test::CodecTestWith2Params<bool, ::libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith3Params<int, int,
+                                                 ::libaom_test::TestMode>,
       public ::libaom_test::EncoderTest {
  protected:
   FilmGrainEncodeTest()
       : EncoderTest(GET_PARAM(0)), test_monochrome_(GET_PARAM(1)),
-        test_mode_(GET_PARAM(2)) {}
+        key_frame_dist_(GET_PARAM(2)), test_mode_(GET_PARAM(3)) {}
   ~FilmGrainEncodeTest() override = default;
 
   void SetUp() override {
     InitializeConfig(test_mode_);
-    cfg_.monochrome = test_monochrome_;
+    cfg_.monochrome = test_monochrome_ == 1;
     cfg_.rc_target_bitrate = 300;
-    cfg_.kf_max_dist = 0;
+    cfg_.kf_max_dist = key_frame_dist_;
+    cfg_.g_lag_in_frames = 0;
   }
 
   void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                           ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_CPUUSED,
+                       test_mode_ == ::libaom_test::kRealTime ? 7 : 5);
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM);
       encoder->Control(AV1E_SET_DENOISE_NOISE_LEVEL, 1);
     } else if (video->frame() == 1) {
-      cfg_.monochrome = 0;
+      cfg_.monochrome = (test_monochrome_ == 1 || test_monochrome_ == 2);
       encoder->Config(&cfg_);
     } else {
-      cfg_.monochrome = test_monochrome_;
+      cfg_.monochrome = test_monochrome_ == 1;
       encoder->Config(&cfg_);
     }
   }
@@ -313,11 +316,6 @@
   bool DoDecode() const override { return false; }
 
   void DoTest() {
-    if (test_monochrome_ && test_mode_ == ::libaom_test::kRealTime) {
-      // TODO(bohanli): Running real time mode with monochrome will cause the
-      // encoder to crash. Check if this is intended or there is a bug.
-      GTEST_SKIP();
-    }
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 3);
     cfg_.g_w = video.img()->d_w;
@@ -326,11 +324,58 @@
   }
 
  private:
-  bool test_monochrome_;
+  // 0: monochrome always off.
+  // 1: monochrome always on.
+  // 2: monochrome changes from 0, 1, 0, for encoded frames 0, 1, 2.
+  // The case where monochrome changes from 1 to 0 (i.e., encoder initialized
+  // with monochrome = 1 and then subsequently encoded with monochrome = 0)
+  // will fail. The test InitMonochrome1EncodeMonochrome0 below verifies this.
+  int test_monochrome_;
+  int key_frame_dist_;
   ::libaom_test::TestMode test_mode_;
 };
 
 TEST_P(FilmGrainEncodeTest, Test) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_SUITE(FilmGrainEncodeTest, ::testing::Bool(),
+AV1_INSTANTIATE_TEST_SUITE(FilmGrainEncodeTest, ::testing::Range(0, 3),
+                           ::testing::Values(0, 10),
                            ::testing::ValuesIn(kFilmGrainEncodeTestModes));
+
+// Initialize the encoder with monochrome = 1, then try to reconfigure it with
+// monochrome = 0. This results in an error: see the corresponding check in
+// encoder_set_config() in av1/av1_cx_iface.c.
+// TODO(marpan): Consider moving this test to another file, as the failure
+// has nothing to do with film grain mode.
+TEST(FilmGrainEncodeTest, InitMonochrome1EncodeMonochrome0) {
+  const int kWidth = 352;
+  const int kHeight = 288;
+  const int usage = AOM_USAGE_REALTIME;
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+  aom_codec_ctx_t enc;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  // Initialize the encoder with monochrome = 1.
+  cfg.monochrome = 1;
+  aom_codec_err_t init_status = aom_codec_enc_init(&enc, iface, &cfg, 0);
+  ASSERT_EQ(init_status, AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 7), AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM),
+            AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DENOISE_NOISE_LEVEL, 1),
+            AOM_CODEC_OK);
+  // Set image with zero values.
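+  // I420 layout: one full-resolution luma plane plus two chroma planes
+  // subsampled by two in each dimension.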
+  constexpr size_t kBufferSize =
+      kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+  std::vector<unsigned char> buffer(kBufferSize);
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               buffer.data()));
+  // Encode first frame.
+  ASSERT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  // Second frame: update the config with monochrome = 0, which is rejected.
+  cfg.monochrome = 0;
+  ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_INVALID_PARAM);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
index c54bec5..0a0ab11 100644
--- a/test/filterintra_test.cc
+++ b/test/filterintra_test.cc
@@ -41,8 +41,8 @@
 
 class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
  public:
-  virtual ~AV1FilterIntraPredTest() {}
-  virtual void SetUp() {
+  ~AV1FilterIntraPredTest() override = default;
+  void SetUp() override {
     PredFuncMode funcMode = GET_PARAM(0);
     predFuncRef_ = std::get<0>(funcMode);
     predFunc_ = std::get<1>(funcMode);
@@ -57,7 +57,7 @@
     ASSERT_NE(pred_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete[] alloc_;
     delete[] predRef_;
     delete[] pred_;
diff --git a/test/forced_max_frame_width_height_test.cc b/test/forced_max_frame_width_height_test.cc
index 2e019b6..3347713 100644
--- a/test/forced_max_frame_width_height_test.cc
+++ b/test/forced_max_frame_width_height_test.cc
@@ -114,10 +114,7 @@
   }
 }
 
-// A test that reproduces bug aomedia:3348: Assertion
-// `ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride'
-// failed.
-TEST(EncodeForcedMaxFrameWidthHeight, DISABLED_DimensionDecreasing) {
+TEST(EncodeForcedMaxFrameWidthHeight, DimensionDecreasing) {
   constexpr int kWidth = 128;
   constexpr int kHeight = 128;
   constexpr size_t kBufferSize = 3 * kWidth * kHeight;
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
deleted file mode 100644
index c355efc..0000000
--- a/test/frame_error_test.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace {
-typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
-                                    const uint8_t *const dst, int p_width,
-                                    int p_height, int p_stride);
-#if HAVE_AVX2 || HAVE_SSE2
-const int kBlockWidth[] = {
-  832, 834, 640, 1280, 1920,
-};
-const int kBlockHeight[] = {
-  480, 482, 360, 720, 1080,
-};
-#endif
-typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
-
-class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
- public:
-  virtual ~AV1FrameErrorTest() {}
-  virtual void SetUp() {
-    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  }
-  virtual void TearDown() {}
-
- protected:
-  void RandomValues(frame_error_func test_impl, int width, int height);
-  void ExtremeValues(frame_error_func test_impl, int width, int height);
-  void RunSpeedTest(frame_error_func test_impl, int width, int height);
-  libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
-
-void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = rnd_.Rand8();
-    ref[i] = rnd_.Rand8();
-  }
-  const int64_t ref_error =
-      av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
-  const int64_t test_error = test_impl(ref, stride, dst, width, height, stride);
-  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
-                                      int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int r = 0; r < 2; r++) {
-    if (r == 0) {
-      memset(dst, 0, max_blk_size);
-      memset(ref, 255, max_blk_size);
-    } else if (r == 1) {
-      memset(dst, 255, max_blk_size);
-      memset(ref, 0, max_blk_size);
-    }
-    const int64_t ref_error =
-        av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
-    const int64_t test_error =
-        test_impl(ref, stride, dst, width, height, stride);
-    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  }
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = ref[i] = rnd_.Rand8();
-  }
-  const int num_loops = 10000000 / (width + height);
-  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
-  double elapsed_time[2] = { 0 };
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    frame_error_func func = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      func(ref, stride, dst, width, height, stride);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  aom_free(dst);
-  aom_free(ref);
-  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
-         elapsed_time[0], elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1FrameErrorTest, CheckOutput) {
-  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-}  // namespace
diff --git a/test/frame_parallel_enc_test.cc b/test/frame_parallel_enc_test.cc
index 7508eb7..86d5ddb 100644
--- a/test/frame_parallel_enc_test.cc
+++ b/test/frame_parallel_enc_test.cc
@@ -36,9 +36,9 @@
     cfg.allow_lowbitdepth = 1;
     decoder_ = codec_->CreateDecoder(cfg, 0);
   }
-  virtual ~AVxFrameParallelThreadEncodeTest() { delete decoder_; }
+  ~AVxFrameParallelThreadEncodeTest() override { delete decoder_; }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.rc_end_usage = AOM_VBR;
     cfg_.g_lag_in_frames = 35;
@@ -49,12 +49,12 @@
     cfg_.g_threads = 16;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                          ::libaom_test::Encoder *encoder) override {
     if (encoder_initialized_) return;
     SetTileSize(encoder);
     encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -73,7 +73,7 @@
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     size_enc_.push_back(pkt->data.frame.sz);
 
     ::libaom_test::MD5 md5_enc;
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 3b35db8..ea8cf47 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -24,18 +24,18 @@
  protected:
   AV1FrameSizeTests()
       : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {}
-  virtual ~AV1FrameSizeTests() {}
+  ~AV1FrameSizeTests() override = default;
 
-  virtual void SetUp() { InitializeConfig(::libaom_test::kRealTime); }
+  void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 7);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -312,15 +312,13 @@
                        ::testing::Range(6, 11)));
 
 #if !CONFIG_REALTIME_ONLY
-// TODO(https://crbug.com/aomedia/3348): Modes that use av1_full_pixel_search()
-// will cause an assert.
 INSTANTIATE_TEST_SUITE_P(
-    DISABLED_GoodQuality, AV1ResolutionChange,
+    GoodQuality, AV1ResolutionChange,
     ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
                        ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
                        ::testing::Range(2, 6)));
 INSTANTIATE_TEST_SUITE_P(
-    DISABLED_GoodQualityLarge, AV1ResolutionChange,
+    GoodQualityLarge, AV1ResolutionChange,
     ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
                        ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
                        ::testing::Range(0, 2)));
@@ -350,18 +348,18 @@
   AV1LosslessFrameSizeTests()
       : EncoderTest(GET_PARAM(0)), frame_size_param_(GET_PARAM(1)),
         encoding_mode_(GET_PARAM(2)) {}
-  virtual ~AV1LosslessFrameSizeTests() {}
+  ~AV1LosslessFrameSizeTests() override = default;
 
-  virtual void SetUp() { InitializeConfig(encoding_mode_); }
+  void SetUp() override { InitializeConfig(encoding_mode_); }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 6);
       encoder->Control(AV1E_SET_LOSSLESS, 1);
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
index fc2a769..2268b9f 100644
--- a/test/function_equivalence_test.h
+++ b/test/function_equivalence_test.h
@@ -55,11 +55,9 @@
  public:
   FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {}
 
-  virtual ~FunctionEquivalenceTest() {}
+  ~FunctionEquivalenceTest() override = default;
 
-  virtual void SetUp() { params_ = this->GetParam(); }
-
-  virtual void TearDown() {}
+  void SetUp() override { params_ = this->GetParam(); }
 
  protected:
   ACMRandom rng_;
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index 9d27db8..bb9e218 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -67,9 +67,9 @@
 class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
                     public ::testing::TestWithParam<Dct4x4Param> {
  public:
-  virtual ~Trans4x4WHT() {}
+  ~Trans4x4WHT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     pitch_ = 4;
@@ -80,13 +80,12 @@
     num_coeffs_ = GET_PARAM(4);
     fwd_txfm_c_ = GET_PARAM(5);
   }
-  virtual void TearDown() {}
 
  protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
   void RunSpeedTest() {
diff --git a/test/gf_pyr_height_test.cc b/test/gf_pyr_height_test.cc
index a2d1a8f..0996d80 100644
--- a/test/gf_pyr_height_test.cc
+++ b/test/gf_pyr_height_test.cc
@@ -79,9 +79,9 @@
     gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height;
     psnr_threshold_ = GET_PARAM(3).psnr_thresh;
   }
-  virtual ~GFPyrHeightTest() {}
+  ~GFPyrHeightTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -95,18 +95,18 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       if (rc_mode_ == AOM_Q) {
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index fc306e6..b01e78f 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -240,7 +240,7 @@
     shift_ = do_shift;
   }
 
-  virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
   // The Rand() function generates values in the range [-((1 << BitDepth) - 1),
   // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform
@@ -252,7 +252,7 @@
 
   void CompareReferenceRandom() {
     const int kMaxBlockSize = 32 * 32;
-    const int block_size_ = bw_ * bh_;
+    const int block_size = bw_ * bh_;
 
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
     DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
@@ -262,13 +262,13 @@
     OutputType b_ref[kMaxBlockSize];
     memset(b_ref, 0, sizeof(b_ref));
 
-    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+    for (int i = 0; i < block_size; ++i) a[i] = Rand();
     ReferenceHadamard(a, bw_, b_ref, bw_, bh_, shift_);
     API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
 
     // The order of the output is not important. Sort before checking.
-    std::sort(b, b + block_size_);
-    std::sort(b_ref, b_ref + block_size_);
+    std::sort(b, b + block_size);
+    std::sort(b_ref, b_ref + block_size);
     EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
   }
 
@@ -298,12 +298,12 @@
 
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
-    const int block_size_ = bw_ * bh_;
+    const int block_size = bw_ * bh_;
 
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
     DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
-    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+    for (int i = 0; i < block_size * 8; ++i) a[i] = Rand();
 
     OutputType b_ref[kMaxBlockSize];
     for (int i = 8; i < 64; i += 8) {
@@ -314,8 +314,8 @@
       API_REGISTER_STATE_CHECK(h_func_(a, i, b));
 
       // The order of the output is not important. Sort before checking.
-      std::sort(b, b + block_size_);
-      std::sort(b_ref, b_ref + block_size_);
+      std::sort(b, b + block_size);
+      std::sort(b_ref, b_ref + block_size);
       EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
     }
   }
@@ -338,6 +338,7 @@
     printf("Hadamard%dx%d[%12d runs]: %d us\n", bw_, bh_, times, elapsed_time);
   }
 
+ protected:
   ACMRandom rnd_;
 
  private:
@@ -351,7 +352,7 @@
  public:
   HadamardLowbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
   // Use values between -255 (0xFF01) and 255 (0x00FF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
     int16_t src = rnd_.Rand8();
     int16_t pred = rnd_.Rand8();
     return src - pred;
@@ -407,7 +408,7 @@
  protected:
   HadamardHighbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
   // Use values between -4095 (0xF001) and 4095 (0x0FFF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
     int16_t src = rnd_.Rand12();
     int16_t pred = rnd_.Rand12();
     return src - pred;
@@ -431,6 +432,15 @@
         HadamardFuncWithSize(&aom_highbd_hadamard_16x16_c, 16, 16),
         HadamardFuncWithSize(&aom_highbd_hadamard_32x32_c, 32, 32)));
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardHighbdTest,
+    ::testing::Values(
+        HadamardFuncWithSize(&aom_highbd_hadamard_8x8_avx2, 8, 8),
+        HadamardFuncWithSize(&aom_highbd_hadamard_16x16_avx2, 16, 16),
+        HadamardFuncWithSize(&aom_highbd_hadamard_32x32_avx2, 32, 32)));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, HadamardHighbdTest,
@@ -447,7 +457,7 @@
  public:
   HadamardLowbdLPTest() : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
   // Use values between -255 (0xFF01) and 255 (0x00FF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
     int16_t src = rnd_.Rand8();
     int16_t pred = rnd_.Rand8();
     return src - pred;
@@ -497,7 +507,7 @@
   HadamardLowbdLP8x8DualTest()
       : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
   // Use values between -255 (0xFF01) and 255 (0x00FF)
-  virtual int16_t Rand() {
+  int16_t Rand() override {
     int16_t src = rnd_.Rand8();
     int16_t pred = rnd_.Rand8();
     return src - pred;
diff --git a/test/hash_test.cc b/test/hash_test.cc
index 61e0b51..a1de932 100644
--- a/test/hash_test.cc
+++ b/test/hash_test.cc
@@ -31,10 +31,10 @@
 
 class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
  public:
-  ~AV1Crc32cHashTest();
-  void SetUp();
+  ~AV1Crc32cHashTest() override;
+  void SetUp() override;
 
-  void TearDown();
+  void TearDown() override;
 
  protected:
   void RunCheckOutput(get_crc32c_value_func test_impl);
@@ -49,7 +49,7 @@
   size_t length_;
 };
 
-AV1Crc32cHashTest::~AV1Crc32cHashTest() {}
+AV1Crc32cHashTest::~AV1Crc32cHashTest() = default;
 
 void AV1Crc32cHashTest::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index 074213a..303d580 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -94,7 +94,7 @@
 
 class HBDMetricsTestBase {
  public:
-  virtual ~HBDMetricsTestBase() {}
+  virtual ~HBDMetricsTestBase() = default;
 
  protected:
   void RunAccuracyCheck() {
@@ -179,14 +179,13 @@
 class HBDMetricsTest : public HBDMetricsTestBase,
                        public ::testing::TestWithParam<MetricTestTParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     lbd_metric_ = GET_PARAM(0);
     hbd_metric_ = GET_PARAM(1);
     input_bit_depth_ = GET_PARAM(2);
     bit_depth_ = GET_PARAM(3);
     threshold_ = GET_PARAM(4);
   }
-  virtual void TearDown() {}
 };
 
 TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc
index 3e93a06..78883cc 100644
--- a/test/hiprec_convolve_test.cc
+++ b/test/hiprec_convolve_test.cc
@@ -47,7 +47,7 @@
 #endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
-#if HAVE_SSSE3 || HAVE_AVX2
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
 TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(4));
 }
@@ -64,6 +64,12 @@
                          libaom_test::AV1HighbdHiprecConvolve::BuildParams(
                              av1_highbd_wiener_convolve_add_src_avx2));
 #endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdHiprecConvolveTest,
+                         libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+                             av1_highbd_wiener_convolve_add_src_neon));
+#endif
 #endif
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
diff --git a/test/hiprec_convolve_test_util.cc b/test/hiprec_convolve_test_util.cc
index e2496b3..6d7902f 100644
--- a/test/hiprec_convolve_test_util.cc
+++ b/test/hiprec_convolve_test_util.cc
@@ -26,21 +26,21 @@
 static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
                              InterpKernel vkernel, int kernel_type = 2) {
   if (kernel_type == 0) {
-    // Low possible values for filter coefficients
+    // Low possible values for filter coefficients, 7-tap kernel
     hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
     hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
     hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
     hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
     hkernel[7] = vkernel[7] = 0;
   } else if (kernel_type == 1) {
-    // Max possible values for filter coefficients
+    // Max possible values for filter coefficients, 7-tap kernel
     hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
     hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
     hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
     hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
     hkernel[7] = vkernel[7] = 0;
-  } else {
-    // Randomly generated values for filter coefficients
+  } else if (kernel_type == 2) {
+    // Randomly generated values for filter coefficients, 7-tap kernel
     hkernel[0] = hkernel[6] =
         WIENER_FILT_TAP0_MINV +
         rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
@@ -64,6 +64,41 @@
         rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
     vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
     vkernel[7] = 0;
+  } else if (kernel_type == 3) {
+    // Low possible values for filter coefficients, 5-tap kernel
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else if (kernel_type == 4) {
+    // Max possible values for filter coefficients, 5-tap kernel
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else {
+    // Randomly generated values for filter coefficients, 5-tap kernel
+    hkernel[0] = hkernel[6] = 0;
+    hkernel[1] = hkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+    hkernel[2] = hkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = 0;
+
+    vkernel[0] = vkernel[6] = 0;
+    vkernel[1] = vkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
+    vkernel[2] = vkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
+    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+    vkernel[7] = 0;
   }
 }
 
@@ -83,19 +118,17 @@
   return ::testing::ValuesIn(params);
 }
 
-AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() = default;
 void AV1HiprecConvolveTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HiprecConvolveTest::TearDown() {}
-
 void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   const int w = 128, h = 128;
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
   const int num_iters = GET_PARAM(2);
   int i, j, k, m;
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
 
   std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
   ASSERT_NE(input_, nullptr);
@@ -114,7 +147,7 @@
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
   DECLARE_ALIGNED(16, InterpKernel, vkernel);
 
-  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
     generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
     for (i = 0; i < num_iters; ++i) {
       for (k = 0; k < h; ++k)
@@ -141,7 +174,7 @@
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
   const int num_iters = GET_PARAM(2) / 500;
   int i, j, k;
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
 
   std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
   ASSERT_NE(input_, nullptr);
@@ -217,13 +250,11 @@
   return ::testing::ValuesIn(params);
 }
 
-AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() {}
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() = default;
 void AV1HighbdHiprecConvolveTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdHiprecConvolveTest::TearDown() {}
-
 void AV1HighbdHiprecConvolveTest::RunCheckOutput(
     highbd_hiprec_convolve_func test_impl) {
   const int w = 128, h = 128;
@@ -231,7 +262,7 @@
   const int num_iters = GET_PARAM(2);
   const int bd = GET_PARAM(3);
   int i, j;
-  const ConvolveParams conv_params = get_conv_params_wiener(bd);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
 
   std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
   ASSERT_NE(input, nullptr);
@@ -255,7 +286,7 @@
   uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
   uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
   uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
-  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
     generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
     for (i = 0; i < num_iters; ++i) {
       // Choose random locations within the source block
@@ -282,7 +313,7 @@
   const int num_iters = GET_PARAM(2) / 500;
   const int bd = GET_PARAM(3);
   int i, j, k;
-  const ConvolveParams conv_params = get_conv_params_wiener(bd);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
 
   std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
   ASSERT_NE(input, nullptr);
diff --git a/test/hiprec_convolve_test_util.h b/test/hiprec_convolve_test_util.h
index e064ba6..beae5c7 100644
--- a/test/hiprec_convolve_test_util.h
+++ b/test/hiprec_convolve_test_util.h
@@ -34,7 +34,7 @@
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h,
-                                     const ConvolveParams *conv_params);
+                                     const WienerConvolveParams *conv_params);
 
 typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
 
@@ -44,10 +44,8 @@
 class AV1HiprecConvolveTest
     : public ::testing::TestWithParam<HiprecConvolveParam> {
  public:
-  virtual ~AV1HiprecConvolveTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
+  ~AV1HiprecConvolveTest() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(hiprec_convolve_func test_impl);
@@ -64,7 +62,7 @@
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     const int16_t *filter_y, int y_step_q4, int w, int h,
-    const ConvolveParams *conv_params, int bps);
+    const WienerConvolveParams *conv_params, int bps);
 
 typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
     HighbdHiprecConvolveParam;
@@ -75,10 +73,8 @@
 class AV1HighbdHiprecConvolveTest
     : public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
  public:
-  virtual ~AV1HighbdHiprecConvolveTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
+  ~AV1HighbdHiprecConvolveTest() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index 2873490..5e397ff 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -33,12 +33,12 @@
 
 class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     data_buf_ = (int16_t *)aom_malloc(MAX_SB_SQUARE * sizeof(int16_t));
     ASSERT_NE(data_buf_, nullptr);
     target_func_ = GET_PARAM(0);
   }
-  virtual void TearDown() { aom_free(data_buf_); }
+  void TearDown() override { aom_free(data_buf_); }
   void RunHorverTest();
   void RunHorverTest_ExtremeValues();
   void RunHorverSpeedTest(int run_times);
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index cba29e9..595ed54 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -100,9 +100,9 @@
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
 
-  virtual ~HorzSuperresEndToEndTest() {}
+  ~HorzSuperresEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_Q;
@@ -118,18 +118,18 @@
     cfg_.rc_superres_mode = superres_mode_;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     frame_count_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     frame_count_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
@@ -203,9 +203,9 @@
     superres_kf_denom_ = std::get<1>(denoms);
   }
 
-  virtual ~HorzSuperresFixedEndToEndTest() {}
+  ~HorzSuperresFixedEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_VBR;
@@ -223,18 +223,18 @@
     cfg_.rc_superres_kf_denominator = superres_kf_denom_;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     frame_count_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     frame_count_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
@@ -313,9 +313,9 @@
     superres_kf_qthresh_ = std::get<1>(qthresholds);
   }
 
-  virtual ~HorzSuperresQThreshEndToEndTest() {}
+  ~HorzSuperresQThreshEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_VBR;
@@ -333,18 +333,18 @@
     cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     frame_count_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     frame_count_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc
index 84e712d..96ee654 100644
--- a/test/intra_edge_test.cc
+++ b/test/intra_edge_test.cc
@@ -37,7 +37,7 @@
   static const int kBufSize = 2 * 64 + 32;
   static const int kOffset = 16;
 
-  virtual ~UpsampleTest() {}
+  ~UpsampleTest() override = default;
 
   virtual void Execute(T *edge_tst) = 0;
 
@@ -62,16 +62,12 @@
   int size_;
 };
 
-//////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
 typedef void (*UP8B)(uint8_t *p, int size);
 typedef libaom_test::FuncParam<UP8B> TestFuncs;
 
 class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
  protected:
-  void Execute(uint8_t *edge_tst) {
+  void Execute(uint8_t *edge_tst) override {
     params_.ref_func(edge_ref_, size_);
     API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
   }
@@ -99,6 +95,18 @@
   }
 }
 
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+  }
+}
+
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, UpsampleTest8B,
@@ -106,16 +114,110 @@
                                 av1_upsample_intra_edge_sse4_1)));
 #endif  // HAVE_SSE4_1
 
-//////////////////////////////////////////////////////////////////////////////
-// High bit-depth version
-//////////////////////////////////////////////////////////////////////////////
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, UpsampleTest8B,
+    ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+                                av1_upsample_intra_edge_neon)));
+#endif  // HAVE_NEON
+
+template <typename F, typename T>
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMaxEdge = 2 * 64;
+  static const int kBufSize = kMaxEdge + 32;
+  static const int kOffset = 15;
+
+  ~FilterEdgeTest() override = default;
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    for (int r = 0; r < size_; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+  int strength_;
+};
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) override {
+    params_.ref_func(edge_ref_, size_, strength_);
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = strength_ == 3 ? 1 : strength_ + 1;
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, FilterEdgeTest8B,
+    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+                                          av1_filter_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, FilterEdgeTest8B,
+    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+                                          av1_filter_intra_edge_neon)));
+#endif  // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
 
 typedef void (*UPHB)(uint16_t *p, int size, int bd);
 typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
 
 class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
  protected:
-  void Execute(uint16_t *edge_tst) {
+  void Execute(uint16_t *edge_tst) override {
     params_.ref_func(edge_ref_, size_, bit_depth_);
     API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
   }
@@ -151,94 +253,40 @@
   }
 }
 
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, UpsampleTestHB,
-    ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
-                                   av1_upsample_intra_edge_high_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-template <typename F, typename T>
-class FilterEdgeTest : public FunctionEquivalenceTest<F> {
- protected:
-  static const int kIterations = 1000000;
-  static const int kMaxEdge = 2 * 64;
-  static const int kBufSize = kMaxEdge + 32;
-  static const int kOffset = 15;
-
-  virtual ~FilterEdgeTest() {}
-
-  virtual void Execute(T *edge_tst) = 0;
-
-  void Common() {
-    edge_ref_ = &edge_ref_data_[kOffset];
-    edge_tst_ = &edge_tst_data_[kOffset];
-
-    Execute(edge_tst_);
-
-    for (int r = 0; r < size_; ++r) {
-      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
-    }
+TEST_P(UpsampleTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
   }
-
-  T edge_ref_data_[kBufSize];
-  T edge_tst_data_[kBufSize];
-
-  T *edge_ref_;
-  T *edge_tst_;
-
-  int size_;
-  int strength_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*FE8B)(uint8_t *p, int size, int strength);
-typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
-
-class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
- protected:
-  void Execute(uint8_t *edge_tst) {
-    params_.ref_func(edge_ref_, size_, strength_);
-    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
-  }
-};
-
-TEST_P(FilterEdgeTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    strength_ = this->rng_(4);
-    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
-
-    int i, pix = 0;
-    for (i = 0; i < kOffset + size_; ++i) {
-      pix = rng_.Rand8();
-      edge_ref_data_[i] = pix;
-      edge_tst_data_[i] = pix;
-    }
-
-    Common();
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
   }
 }
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, FilterEdgeTest8B,
-    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
-                                          av1_filter_intra_edge_sse4_1)));
+    SSE4_1, UpsampleTestHB,
+    ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+                                   av1_highbd_upsample_intra_edge_sse4_1)));
 #endif  // HAVE_SSE4_1
 
-//////////////////////////////////////////////////////////////////////////////
-// High bit-depth version
-//////////////////////////////////////////////////////////////////////////////
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, UpsampleTestHB,
+    ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+                                   av1_highbd_upsample_intra_edge_neon)));
+#endif  // HAVE_NEON
 
 typedef void (*FEHB)(uint16_t *p, int size, int strength);
 typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
 
 class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
  protected:
-  void Execute(uint16_t *edge_tst) {
+  void Execute(uint16_t *edge_tst) override {
     params_.ref_func(edge_ref_, size_, strength_);
     API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
   }
@@ -267,56 +315,6 @@
   }
 }
 
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
-                         ::testing::Values(FilterEdgeTestFuncsHBD(
-                             av1_filter_intra_edge_high_c,
-                             av1_filter_intra_edge_high_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-// Speed tests
-
-TEST_P(UpsampleTest8B, DISABLED_Speed) {
-  const int test_count = 10000000;
-  size_ = kMaxEdge;
-  for (int i = 0; i < kOffset + size_; ++i) {
-    edge_tst_data_[i] = rng_.Rand8();
-  }
-  edge_tst_ = &edge_tst_data_[kOffset];
-  for (int iter = 0; iter < test_count; ++iter) {
-    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
-  }
-}
-
-TEST_P(UpsampleTestHB, DISABLED_Speed) {
-  const int test_count = 10000000;
-  size_ = kMaxEdge;
-  bit_depth_ = 12;
-  const int hi = 1 << bit_depth_;
-  for (int i = 0; i < kOffset + size_; ++i) {
-    edge_tst_data_[i] = rng_(hi);
-  }
-  edge_tst_ = &edge_tst_data_[kOffset];
-  for (int iter = 0; iter < test_count; ++iter) {
-    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
-  }
-}
-
-TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
-  const int test_count = 10000000;
-  size_ = kMaxEdge;
-  strength_ = 1;
-  for (int i = 0; i < kOffset + size_; ++i) {
-    edge_tst_data_[i] = rng_.Rand8();
-  }
-  edge_tst_ = &edge_tst_data_[kOffset];
-  for (int iter = 0; iter < test_count; ++iter) {
-    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
-    // iterate over filter strengths (1,2,3)
-    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
-  }
-}
-
 TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
   const int test_count = 10000000;
   size_ = kMaxEdge;
@@ -330,8 +328,24 @@
   for (int iter = 0; iter < test_count; ++iter) {
     API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
     // iterate over filter strengths (1,2,3)
-    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+    strength_ = strength_ == 3 ? 1 : strength_ + 1;
   }
 }
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+                         ::testing::Values(FilterEdgeTestFuncsHBD(
+                             av1_highbd_filter_intra_edge_c,
+                             av1_highbd_filter_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterEdgeTestHB,
+                         ::testing::Values(FilterEdgeTestFuncsHBD(
+                             av1_highbd_filter_intra_edge_c,
+                             av1_highbd_filter_intra_edge_neon)));
+#endif  // HAVE_NEON
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 }  // namespace
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index aced593..8796e8b 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -155,7 +155,7 @@
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
     stride_ = params_.block_width * 3;
     mask_ = (1 << params_.bit_depth) - 1;
@@ -195,19 +195,19 @@
 #if CONFIG_AV1_HIGHBITDEPTH
 class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
  protected:
-  void Predict() {
+  void Predict() override {
     const int bit_depth = params_.bit_depth;
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
     API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
-  void PredictRefSpeedTest(int num) {
+  void PredictRefSpeedTest(int num) override {
     const int bit_depth = params_.bit_depth;
     for (int i = 0; i < num; i++) {
       params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
     }
   }
-  void PredictFncSpeedTest(int num) {
+  void PredictFncSpeedTest(int num) override {
     const int bit_depth = params_.bit_depth;
     for (int i = 0; i < num; i++) {
       params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth);
@@ -220,17 +220,17 @@
 
 class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
  protected:
-  void Predict() {
+  void Predict() override {
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
     API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_));
   }
-  void PredictRefSpeedTest(int num) {
+  void PredictRefSpeedTest(int num) override {
     for (int i = 0; i < num; i++) {
       params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
     }
   }
-  void PredictFncSpeedTest(int num) {
+  void PredictFncSpeedTest(int num) override {
     for (int i = 0; i < num; i++) {
       params_.pred_fn(dst_, stride_, above_row_, left_col_);
     }
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 63e15ca..791cdb8 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -45,7 +45,7 @@
  protected:
   InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
 
-  virtual ~InvalidFileTest() {
+  ~InvalidFileTest() override {
     if (res_file_ != nullptr) fclose(res_file_);
   }
 
@@ -55,15 +55,14 @@
         << "Result file open failed. Filename: " << res_file_name;
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     const unsigned int /*frame_number*/) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int /*frame_number*/) override {
     EXPECT_NE(img.fb_priv, nullptr);
   }
 
-  virtual bool HandleDecodeResult(
-      const aom_codec_err_t res_dec,
-      const libaom_test::CompressedVideoSource &video,
-      libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_NE(res_file_, nullptr);
     int expected_res_dec = -1;
 
@@ -95,9 +94,9 @@
     return !HasFailure();
   }
 
-  virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
-                                libaom_test::CompressedVideoSource * /*video*/,
-                                const aom_codec_err_t /*res_peek*/) {}
+  void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+                        libaom_test::CompressedVideoSource * /*video*/,
+                        const aom_codec_err_t /*res_peek*/) override {}
 
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 45828b5..85731f5 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -37,20 +37,20 @@
         compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
         end_of_file_(false) {}
 
-  virtual ~IVFVideoSource() {
+  ~IVFVideoSource() override {
     delete[] compressed_frame_buf_;
 
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Init() {
+  void Init() override {
     // Allocate a buffer for read in the compressed video frame.
     compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
     ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
     ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
   }
 
-  virtual void Begin() {
+  void Begin() override {
     input_file_ = OpenTestDataFile(file_name_);
     ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
@@ -67,7 +67,7 @@
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
@@ -94,11 +94,11 @@
     }
   }
 
-  virtual const uint8_t *cxdata() const {
+  const uint8_t *cxdata() const override {
     return end_of_file_ ? nullptr : compressed_frame_buf_;
   }
-  virtual size_t frame_size() const { return frame_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  size_t frame_size() const override { return frame_sz_; }
+  unsigned int frame_number() const override { return frame_; }
 
  protected:
   std::string file_name_;
diff --git a/test/kf_test.cc b/test/kf_test.cc
index 5daf600..bc475fd 100644
--- a/test/kf_test.cc
+++ b/test/kf_test.cc
@@ -47,9 +47,9 @@
     kf_dist_ = -1;
     is_kf_interval_violated_ = false;
   }
-  virtual ~KeyFrameIntervalTestLarge() {}
+  ~KeyFrameIntervalTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -60,18 +60,18 @@
     cfg_.g_lag_in_frames = 19;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -149,9 +149,9 @@
     frame_num_ = 0;
     is_kf_placement_violated_ = false;
   }
-  virtual ~ForcedKeyTestLarge() {}
+  ~ForcedKeyTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     cfg_.rc_end_usage = rc_end_usage_;
     cfg_.g_threads = 0;
@@ -160,8 +160,8 @@
     cfg_.fwd_kf_enabled = fwd_kf_enabled_;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, auto_alt_ref_);
@@ -176,8 +176,8 @@
         ((int)video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       if ((int)frame_num_ == forced_kf_frame_num_) {
diff --git a/test/level_test.cc b/test/level_test.cc
index cc79926..a7c26d2 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -40,9 +40,9 @@
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), target_level_(31) {}
 
-  virtual ~LevelTest() {}
+  ~LevelTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 5;
@@ -53,8 +53,8 @@
     }
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
diff --git a/test/loopfilter_control_test.cc b/test/loopfilter_control_test.cc
index 5f01340..9c00235 100644
--- a/test/loopfilter_control_test.cc
+++ b/test/loopfilter_control_test.cc
@@ -79,9 +79,9 @@
         aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
         tile_columns_(GET_PARAM(5)) {}
 
-  virtual ~LFControlEndToEndTest() {}
+  ~LFControlEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kRealTime);
 
     cfg_.g_threads = threads_;
@@ -92,18 +92,18 @@
     cfg_.kf_min_dist = 9999;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
       encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index ef4e19f..756ad05 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -33,15 +33,15 @@
         encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)),
         cpu_used_(GET_PARAM(3)) {}
 
-  virtual ~LosslessTestLarge() {}
+  ~LosslessTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     cfg_.rc_end_usage = rc_end_usage_;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       // Only call Control if quantizer > 0 to verify that using quantizer
       // alone will activate lossless
@@ -52,19 +52,19 @@
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
   }
 
   double GetMinPsnr() const { return psnr_; }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 421fdef..04b1c86 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -127,16 +127,14 @@
 template <typename func_type_t, typename params_t>
 class LoopTestParam : public ::testing::TestWithParam<params_t> {
  public:
-  virtual ~LoopTestParam() {}
-  virtual void SetUp() {
+  ~LoopTestParam() override = default;
+  void SetUp() override {
     loopfilter_op_ = std::get<0>(this->GetParam());
     ref_loopfilter_op_ = std::get<1>(this->GetParam());
     bit_depth_ = std::get<2>(this->GetParam());
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() {}
-
  protected:
   int bit_depth_;
   int mask_;
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 2ef3e4d..bb03746 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -45,8 +45,8 @@
 
 class MaskedSADTestBase : public ::testing::Test {
  public:
-  virtual ~MaskedSADTestBase() {}
-  virtual void SetUp() = 0;
+  ~MaskedSADTestBase() override = default;
+  void SetUp() override = 0;
   virtual void runRef(const uint8_t *src_ptr, int src_stride,
                       const uint8_t *ref_ptr[], int ref_stride,
                       const uint8_t *second_pred, const uint8_t *msk,
@@ -58,28 +58,26 @@
                        int msk_stride, int inv_mask, unsigned sads[],
                        int times) = 0;
 
-  virtual void TearDown() {}
   void runMaskedSADTest(int run_times);
 };
 
 class MaskedSADTest : public MaskedSADTestBase,
                       public ::testing::WithParamInterface<MaskedSADParam> {
  public:
-  virtual ~MaskedSADTest() {}
-  virtual void SetUp() {
+  ~MaskedSADTest() override = default;
+  void SetUp() override {
     maskedSAD_op_ = GET_PARAM(0);
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
 
-  virtual void runRef(const uint8_t *src_ptr, int src_stride,
-                      const uint8_t *ref_ptr[], int ref_stride,
-                      const uint8_t *second_pred, const uint8_t *msk,
-                      int msk_stride, int inv_mask, unsigned sads[], int times);
-  virtual void runTest(const uint8_t *src_ptr, int src_stride,
-                       const uint8_t *ref_ptr[], int ref_stride,
-                       const uint8_t *second_pred, const uint8_t *msk,
-                       int msk_stride, int inv_mask, unsigned sads[],
-                       int times);
+  void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+              int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+              int msk_stride, int inv_mask, unsigned sads[],
+              int times) override;
+  void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+               int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+               int msk_stride, int inv_mask, unsigned sads[],
+               int times) override;
 
  protected:
   MaskedSADFunc maskedSAD_op_;
@@ -90,20 +88,19 @@
 class MaskedSADx4Test : public MaskedSADTestBase,
                         public ::testing::WithParamInterface<MaskedSADx4Param> {
  public:
-  virtual ~MaskedSADx4Test() {}
-  virtual void SetUp() {
+  ~MaskedSADx4Test() override = default;
+  void SetUp() override {
     maskedSAD_op_ = GET_PARAM(0);
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
-  virtual void runRef(const uint8_t *src_ptr, int src_stride,
-                      const uint8_t *ref_ptr[], int ref_stride,
-                      const uint8_t *second_pred, const uint8_t *msk,
-                      int msk_stride, int inv_mask, unsigned sads[], int times);
-  virtual void runTest(const uint8_t *src_ptr, int src_stride,
-                       const uint8_t *ref_ptr[], int ref_stride,
-                       const uint8_t *second_pred, const uint8_t *msk,
-                       int msk_stride, int inv_mask, unsigned sads[],
-                       int times);
+  void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+              int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+              int msk_stride, int inv_mask, unsigned sads[],
+              int times) override;
+  void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+               int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+               int msk_stride, int inv_mask, unsigned sads[],
+               int times) override;
 
  protected:
   MaskedSADx4Func maskedSAD_op_;
@@ -264,13 +261,12 @@
 class HighbdMaskedSADTest
     : public ::testing::TestWithParam<HighbdMaskedSADParam> {
  public:
-  virtual ~HighbdMaskedSADTest() {}
-  virtual void SetUp() {
+  ~HighbdMaskedSADTest() override = default;
+  void SetUp() override {
     maskedSAD_op_ = GET_PARAM(0);
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() {}
   void runHighbdMaskedSADTest(int run_times);
 
  protected:
@@ -581,6 +577,41 @@
 
 INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADx4Test,
                          ::testing::ValuesIn(msadx4_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const MaskedSADParam hbd_msad_neon_test[] = {
+  make_tuple(&aom_highbd_masked_sad4x4_neon, &aom_highbd_masked_sad4x4_c),
+  make_tuple(&aom_highbd_masked_sad4x8_neon, &aom_highbd_masked_sad4x8_c),
+  make_tuple(&aom_highbd_masked_sad8x4_neon, &aom_highbd_masked_sad8x4_c),
+  make_tuple(&aom_highbd_masked_sad8x8_neon, &aom_highbd_masked_sad8x8_c),
+  make_tuple(&aom_highbd_masked_sad8x16_neon, &aom_highbd_masked_sad8x16_c),
+  make_tuple(&aom_highbd_masked_sad16x8_neon, &aom_highbd_masked_sad16x8_c),
+  make_tuple(&aom_highbd_masked_sad16x16_neon, &aom_highbd_masked_sad16x16_c),
+  make_tuple(&aom_highbd_masked_sad16x32_neon, &aom_highbd_masked_sad16x32_c),
+  make_tuple(&aom_highbd_masked_sad32x16_neon, &aom_highbd_masked_sad32x16_c),
+  make_tuple(&aom_highbd_masked_sad32x32_neon, &aom_highbd_masked_sad32x32_c),
+  make_tuple(&aom_highbd_masked_sad32x64_neon, &aom_highbd_masked_sad32x64_c),
+  make_tuple(&aom_highbd_masked_sad64x32_neon, &aom_highbd_masked_sad64x32_c),
+  make_tuple(&aom_highbd_masked_sad64x64_neon, &aom_highbd_masked_sad64x64_c),
+  make_tuple(&aom_highbd_masked_sad64x128_neon, &aom_highbd_masked_sad64x128_c),
+  make_tuple(&aom_highbd_masked_sad128x64_neon, &aom_highbd_masked_sad128x64_c),
+  make_tuple(&aom_highbd_masked_sad128x128_neon,
+             &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(&aom_highbd_masked_sad4x16_neon, &aom_highbd_masked_sad4x16_c),
+  make_tuple(&aom_highbd_masked_sad16x4_neon, &aom_highbd_masked_sad16x4_c),
+  make_tuple(&aom_highbd_masked_sad8x32_neon, &aom_highbd_masked_sad8x32_c),
+  make_tuple(&aom_highbd_masked_sad32x8_neon, &aom_highbd_masked_sad32x8_c),
+  make_tuple(&aom_highbd_masked_sad16x64_neon, &aom_highbd_masked_sad16x64_c),
+  make_tuple(&aom_highbd_masked_sad64x16_neon, &aom_highbd_masked_sad64x16_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdMaskedSADTest,
+                         ::testing::ValuesIn(hbd_msad_neon_test));
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 #endif  // HAVE_NEON
 
 }  // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index e76403e..8482a12 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -43,14 +43,12 @@
 class MaskedSubPixelVarianceTest
     : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
  public:
-  virtual ~MaskedSubPixelVarianceTest() {}
-  virtual void SetUp() {
+  ~MaskedSubPixelVarianceTest() override = default;
+  void SetUp() override {
     opt_func_ = GET_PARAM(0);
     ref_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() {}
-
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
   MaskedSubPixelVarianceFunc ref_func_;
@@ -179,15 +177,13 @@
 class HighbdMaskedSubPixelVarianceTest
     : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
  public:
-  virtual ~HighbdMaskedSubPixelVarianceTest() {}
-  virtual void SetUp() {
+  ~HighbdMaskedSubPixelVarianceTest() override = default;
+  void SetUp() override {
     opt_func_ = GET_PARAM(0);
     ref_func_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() {}
-
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
   MaskedSubPixelVarianceFunc ref_func_;
@@ -568,5 +564,149 @@
 
 INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, MaskedSubPixelVarianceTest,
                          ::testing::ValuesIn(sub_pel_var_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test_neon[] = {
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_neon,
+             &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_neon,
+             &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_neon,
+             &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_neon,
+             &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_neon,
+             &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_neon,
+             &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_neon,
+             &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_neon,
+             &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_neon,
+             &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_neon,
+             &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_neon,
+             &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_neon,
+             &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_neon,
+             &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_neon,
+             &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_neon,
+             &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_neon,
+             &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_neon,
+             &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_neon,
+             &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_neon,
+             &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_neon,
+             &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_neon,
+             &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_neon,
+             &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_neon,
+             &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_neon,
+             &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_neon,
+             &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_neon,
+             &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_neon,
+             &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_neon,
+             &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_neon,
+             &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_neon,
+             &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_neon,
+             &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_neon,
+             &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_neon,
+             &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_neon,
+             &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_neon,
+             &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_neon,
+             &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_neon,
+             &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_neon,
+             &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_neon,
+             &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_neon,
+             &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_neon,
+             &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_neon,
+             &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_neon,
+             &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_neon,
+             &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_neon,
+             &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_neon,
+             &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_neon,
+             &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_neon,
+             &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_neon,
+             &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_neon,
+             &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_neon,
+             &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_neon,
+             &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_neon,
+             &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_neon,
+             &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_neon,
+             &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_neon,
+             &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_neon,
+             &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_neon,
+             &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_neon,
+             &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_neon,
+             &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_neon,
+             &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_neon,
+             &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_neon,
+             &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_neon,
+             &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_neon,
+             &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_neon,
+             &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(hbd_sub_pel_var_test_neon));
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 #endif  // HAVE_NEON
 }  // namespace
diff --git a/test/metadata_test.cc b/test/metadata_test.cc
index e77e5d1..9467c29 100644
--- a/test/metadata_test.cc
+++ b/test/metadata_test.cc
@@ -56,12 +56,12 @@
  protected:
   MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~MetadataEncodeTest() {}
+  ~MetadataEncodeTest() override = default;
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder * /*encoder*/) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder * /*encoder*/) override {
     aom_image_t *current_frame = video->img();
     if (current_frame) {
       if (current_frame->metadata) aom_img_remove_metadata(current_frame);
@@ -95,7 +95,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
       const size_t bitstream_size = pkt->data.frame.sz;
       const uint8_t *bitstream =
@@ -138,8 +138,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t /*pts*/) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t /*pts*/) override {
     ASSERT_NE(img.metadata, nullptr);
 
     ASSERT_EQ(img.metadata->sz, 3u);
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index cf67b7b..33be4ff 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -31,7 +31,7 @@
 
 class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     mm_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index c5229fc..f22b5fe 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -42,12 +42,12 @@
       : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)),
         frame0_psnr_y_(0.0) {}
 
-  virtual ~MonochromeTest() {}
+  ~MonochromeTest() override = default;
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3));
       if (mode_ == ::libaom_test::kAllIntra) {
@@ -59,8 +59,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     (void)pts;
 
     // Get value of top-left corner pixel of U plane
@@ -96,7 +96,7 @@
     return true;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     // Check average PSNR value is >= 100 db in case of lossless encoding.
     if (lossless_) {
       EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
diff --git a/test/motion_vector_test.cc b/test/motion_vector_test.cc
index bf10ede..4fc8d53 100644
--- a/test/motion_vector_test.cc
+++ b/test/motion_vector_test.cc
@@ -43,9 +43,9 @@
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
 
-  virtual ~MotionVectorTestLarge() {}
+  ~MotionVectorTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 3;
@@ -56,8 +56,8 @@
     }
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
diff --git a/test/mv_cost_test.cc b/test/mv_cost_test.cc
index 86e310c..73d5666 100644
--- a/test/mv_cost_test.cc
+++ b/test/mv_cost_test.cc
@@ -23,18 +23,19 @@
   int bits_cost[MV_OFFSET_BITS][2];
   int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
   int class0_hp_cost[2], hp_cost[2];
-  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
-  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
-  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, nullptr);
+  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, nullptr);
+  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, nullptr);
   for (i = 0; i < MV_OFFSET_BITS; ++i) {
-    av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], nullptr);
   }
   for (i = 0; i < CLASS0_SIZE; ++i)
-    av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
-  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+    av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+                             nullptr);
+  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, nullptr);
   if (precision > MV_SUBPEL_LOW_PRECISION) {
-    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
-    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, nullptr);
+    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, nullptr);
   }
   mvcost[0] = 0;
   for (v = 1; v <= MV_MAX; ++v) {
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 650af79..b3edcc2 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -24,7 +24,7 @@
 
 // Return normally distrbuted values with standard deviation of sigma.
 double randn(libaom_test::ACMRandom *random, double sigma) {
-  while (1) {
+  while (true) {
     const double u = 2.0 * ((double)random->Rand31() /
                             testing::internal::Random::kMaxRange) -
                      1.0;
@@ -367,7 +367,7 @@
 template <typename T>
 class FlatBlockEstimatorTest : public ::testing::Test, public T {
  public:
-  virtual void SetUp() { random_.Reset(171); }
+  void SetUp() override { random_.Reset(171); }
   typedef std::vector<typename T::data_type_t> VecType;
   VecType data_;
   libaom_test::ACMRandom random_;
@@ -544,7 +544,7 @@
   static const int kNumBlocksX = kWidth / kBlockSize;
   static const int kNumBlocksY = kHeight / kBlockSize;
 
-  virtual void SetUp() {
+  void SetUp() override {
     const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
                                               T::kBitDepth, T::kUseHighBD };
     ASSERT_TRUE(aom_noise_model_init(&model_, params));
@@ -576,7 +576,7 @@
                                   &flat_blocks_[0], block_size);
   }
 
-  void TearDown() { aom_noise_model_free(&model_); }
+  void TearDown() override { aom_noise_model_free(&model_); }
 
  protected:
   aom_noise_model_t model_;
@@ -1186,7 +1186,7 @@
   static void SetUpTestSuite() { aom_dsp_rtcd(); }
 
  protected:
-  void SetUp() {
+  void SetUp() override {
     static const float kNoiseLevel = 5.f;
     static const float kStd = 4.0;
     static const double kMaxValue = (1 << T::kBitDepth) - 1;
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index 8d13ac1..967b677 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -236,6 +236,38 @@
   }
 }
 
+#if HAVE_NEON
+ObmcSadHBDTest::ParamType neon_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_neon),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_neon),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_neon),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_neon),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_neon),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_neon),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_neon),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_neon),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_neon),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_neon),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_neon),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_neon),
+#if !CONFIG_REALTIME_ONLY
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_neon),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_neon),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_neon),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_neon),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcSadHBDTest,
+                         ::testing::ValuesIn(neon_functions_hbd));
+#endif  // HAVE_NEON
+
 #if HAVE_SSE4_1
 ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
   TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index b2bf42a..5f21a8a 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -228,7 +228,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
-#if CONFIG_AV1_HIGHBITDEPTH
+#if CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceHBDTest);
 
@@ -287,40 +287,180 @@
   }
 }
 
+#if HAVE_NEON
+ObmcVarianceHBDTest::ParamType neon_functions_hbd[] = {
+  TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+            aom_highbd_8_obmc_variance128x128_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+            aom_highbd_8_obmc_variance128x64_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+            aom_highbd_8_obmc_variance64x128_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+            aom_highbd_8_obmc_variance64x64_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+            aom_highbd_8_obmc_variance64x32_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+            aom_highbd_8_obmc_variance32x64_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+            aom_highbd_8_obmc_variance32x32_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+            aom_highbd_8_obmc_variance32x16_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+            aom_highbd_8_obmc_variance16x32_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+            aom_highbd_8_obmc_variance16x16_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+            aom_highbd_8_obmc_variance16x8_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+            aom_highbd_8_obmc_variance8x16_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x8_c, aom_highbd_8_obmc_variance8x8_neon,
+            8),
+  TestFuncs(aom_highbd_8_obmc_variance8x4_c, aom_highbd_8_obmc_variance8x4_neon,
+            8),
+  TestFuncs(aom_highbd_8_obmc_variance4x8_c, aom_highbd_8_obmc_variance4x8_neon,
+            8),
+  TestFuncs(aom_highbd_8_obmc_variance4x4_c, aom_highbd_8_obmc_variance4x4_neon,
+            8),
+  TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+            aom_highbd_10_obmc_variance128x128_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+            aom_highbd_10_obmc_variance128x64_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+            aom_highbd_10_obmc_variance64x128_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance64x64_c,
+            aom_highbd_10_obmc_variance64x64_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance64x32_c,
+            aom_highbd_10_obmc_variance64x32_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance32x64_c,
+            aom_highbd_10_obmc_variance32x64_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance32x32_c,
+            aom_highbd_10_obmc_variance32x32_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance32x16_c,
+            aom_highbd_10_obmc_variance32x16_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x32_c,
+            aom_highbd_10_obmc_variance16x32_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x16_c,
+            aom_highbd_10_obmc_variance16x16_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x8_c,
+            aom_highbd_10_obmc_variance16x8_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance8x16_c,
+            aom_highbd_10_obmc_variance8x16_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance8x8_c,
+            aom_highbd_10_obmc_variance8x8_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance8x4_c,
+            aom_highbd_10_obmc_variance8x4_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance4x8_c,
+            aom_highbd_10_obmc_variance4x8_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance4x4_c,
+            aom_highbd_10_obmc_variance4x4_neon, 10),
+  TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+            aom_highbd_12_obmc_variance128x128_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+            aom_highbd_12_obmc_variance128x64_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+            aom_highbd_12_obmc_variance64x128_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance64x64_c,
+            aom_highbd_12_obmc_variance64x64_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance64x32_c,
+            aom_highbd_12_obmc_variance64x32_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance32x64_c,
+            aom_highbd_12_obmc_variance32x64_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance32x32_c,
+            aom_highbd_12_obmc_variance32x32_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance32x16_c,
+            aom_highbd_12_obmc_variance32x16_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x32_c,
+            aom_highbd_12_obmc_variance16x32_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x16_c,
+            aom_highbd_12_obmc_variance16x16_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x8_c,
+            aom_highbd_12_obmc_variance16x8_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance8x16_c,
+            aom_highbd_12_obmc_variance8x16_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance8x8_c,
+            aom_highbd_12_obmc_variance8x8_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance8x4_c,
+            aom_highbd_12_obmc_variance8x4_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance4x8_c,
+            aom_highbd_12_obmc_variance4x8_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance4x4_c,
+            aom_highbd_12_obmc_variance4x4_neon, 12),
+  TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+            aom_highbd_8_obmc_variance64x16_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+            aom_highbd_8_obmc_variance16x64_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+            aom_highbd_8_obmc_variance32x8_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+            aom_highbd_8_obmc_variance8x32_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+            aom_highbd_8_obmc_variance16x4_neon, 8),
+  TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+            aom_highbd_8_obmc_variance4x16_neon, 8),
+  TestFuncs(aom_highbd_10_obmc_variance64x16_c,
+            aom_highbd_10_obmc_variance64x16_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x64_c,
+            aom_highbd_10_obmc_variance16x64_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance32x8_c,
+            aom_highbd_10_obmc_variance32x8_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance8x32_c,
+            aom_highbd_10_obmc_variance8x32_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance16x4_c,
+            aom_highbd_10_obmc_variance16x4_neon, 10),
+  TestFuncs(aom_highbd_10_obmc_variance4x16_c,
+            aom_highbd_10_obmc_variance4x16_neon, 10),
+  TestFuncs(aom_highbd_12_obmc_variance64x16_c,
+            aom_highbd_12_obmc_variance64x16_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x64_c,
+            aom_highbd_12_obmc_variance16x64_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance32x8_c,
+            aom_highbd_12_obmc_variance32x8_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance8x32_c,
+            aom_highbd_12_obmc_variance8x32_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance16x4_c,
+            aom_highbd_12_obmc_variance16x4_neon, 12),
+  TestFuncs(aom_highbd_12_obmc_variance4x16_c,
+            aom_highbd_12_obmc_variance4x16_neon, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcVarianceHBDTest,
+                         ::testing::ValuesIn(neon_functions_hbd));
+#endif  // HAVE_NEON
+
 #if HAVE_SSE4_1
 ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
-  TestFuncs(aom_highbd_obmc_variance128x128_c,
-            aom_highbd_obmc_variance128x128_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance128x64_c,
-            aom_highbd_obmc_variance128x64_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance64x128_c,
-            aom_highbd_obmc_variance64x128_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance64x64_c,
-            aom_highbd_obmc_variance64x64_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance64x32_c,
-            aom_highbd_obmc_variance64x32_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance32x64_c,
-            aom_highbd_obmc_variance32x64_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance32x32_c,
-            aom_highbd_obmc_variance32x32_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance32x16_c,
-            aom_highbd_obmc_variance32x16_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance16x32_c,
-            aom_highbd_obmc_variance16x32_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance16x16_c,
-            aom_highbd_obmc_variance16x16_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance16x8_c, aom_highbd_obmc_variance16x8_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance8x16_c, aom_highbd_obmc_variance8x16_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance8x8_c, aom_highbd_obmc_variance8x8_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance8x4_c, aom_highbd_obmc_variance8x4_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance4x8_c, aom_highbd_obmc_variance4x8_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1,
-            8),
+  TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+            aom_highbd_8_obmc_variance128x128_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+            aom_highbd_8_obmc_variance128x64_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+            aom_highbd_8_obmc_variance64x128_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+            aom_highbd_8_obmc_variance64x64_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+            aom_highbd_8_obmc_variance64x32_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+            aom_highbd_8_obmc_variance32x64_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+            aom_highbd_8_obmc_variance32x32_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+            aom_highbd_8_obmc_variance32x16_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+            aom_highbd_8_obmc_variance16x32_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+            aom_highbd_8_obmc_variance16x16_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+            aom_highbd_8_obmc_variance16x8_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+            aom_highbd_8_obmc_variance8x16_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x8_c,
+            aom_highbd_8_obmc_variance8x8_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x4_c,
+            aom_highbd_8_obmc_variance8x4_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance4x8_c,
+            aom_highbd_8_obmc_variance4x8_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance4x4_c,
+            aom_highbd_8_obmc_variance4x4_sse4_1, 8),
   TestFuncs(aom_highbd_10_obmc_variance128x128_c,
             aom_highbd_10_obmc_variance128x128_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance128x64_c,
@@ -386,18 +526,18 @@
   TestFuncs(aom_highbd_12_obmc_variance4x4_c,
             aom_highbd_12_obmc_variance4x4_sse4_1, 12),
 
-  TestFuncs(aom_highbd_obmc_variance64x16_c,
-            aom_highbd_obmc_variance64x16_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance16x64_c,
-            aom_highbd_obmc_variance16x64_sse4_1, 8),
-  TestFuncs(aom_highbd_obmc_variance32x8_c, aom_highbd_obmc_variance32x8_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance8x32_c, aom_highbd_obmc_variance8x32_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance16x4_c, aom_highbd_obmc_variance16x4_sse4_1,
-            8),
-  TestFuncs(aom_highbd_obmc_variance4x16_c, aom_highbd_obmc_variance4x16_sse4_1,
-            8),
+  TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+            aom_highbd_8_obmc_variance64x16_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+            aom_highbd_8_obmc_variance16x64_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+            aom_highbd_8_obmc_variance32x8_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+            aom_highbd_8_obmc_variance8x32_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+            aom_highbd_8_obmc_variance16x4_sse4_1, 8),
+  TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+            aom_highbd_8_obmc_variance4x16_sse4_1, 8),
   TestFuncs(aom_highbd_10_obmc_variance64x16_c,
             aom_highbd_10_obmc_variance64x16_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance16x64_c,
@@ -427,5 +567,5 @@
 INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest,
                          ::testing::ValuesIn(sse4_functions_hbd));
 #endif  // HAVE_SSE4_1
-#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
 }  // namespace
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 131e1dd..534d9b1 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -43,7 +43,7 @@
 class PixelProjErrorTest
     : public ::testing::TestWithParam<PixelProjErrorTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     target_func_ = GET_PARAM(0);
     src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
                                   sizeof(*src_)));
@@ -58,7 +58,7 @@
                                    sizeof(*flt1_)));
     ASSERT_NE(flt1_, nullptr);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dgd_);
     aom_free(flt0_);
@@ -215,7 +215,7 @@
 class PixelProjHighbdErrorTest
     : public ::testing::TestWithParam<PixelProjErrorTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     target_func_ = GET_PARAM(0);
     src_ =
         (uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_));
@@ -230,7 +230,7 @@
         (int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt1_));
     ASSERT_NE(flt1_, nullptr);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dgd_);
     aom_free(flt0_);
@@ -386,7 +386,7 @@
 class GetProjSubspaceTest
     : public ::testing::TestWithParam<GetProjSubspaceTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     target_func_ = GET_PARAM(0);
     src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
                                   sizeof(*src_)));
@@ -401,7 +401,7 @@
                                    sizeof(*flt1_)));
     ASSERT_NE(flt1_, nullptr);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dgd_);
     aom_free(flt0_);
@@ -432,7 +432,9 @@
   const int flt0_stride = MAX_DATA_BLOCK;
   const int flt1_stride = MAX_DATA_BLOCK;
   sgr_params_type params;
-  const int iters = run_times == 1 ? kIterations : 4;
+  const int iters = run_times == 1 ? kIterations : 3;
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
     int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -444,10 +446,8 @@
       flt1_[i] = rng_.Rand15Signed();
     }
 
-    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
-    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
     uint8_t *dgd = dgd_;
     uint8_t *src = src_;
 
@@ -492,6 +492,8 @@
   const int flt1_stride = MAX_DATA_BLOCK;
   sgr_params_type params;
   const int iters = kIterations;
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
     int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -502,10 +504,8 @@
       flt0_[i] = rng_.Rand15Signed();
       flt1_[i] = rng_.Rand15Signed();
     }
-    params.r[0] = 1;
-    params.r[1] = 1;
-    params.s[0] = rng_.Rand8() % MAX_RADIUS;
-    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    params.r[0] = kR0[iter % 3];
+    params.r[1] = kR1[iter % 3];
     uint8_t *dgd = dgd_;
     uint8_t *src = src_;
 
@@ -546,6 +546,12 @@
                          ::testing::Values(av1_calc_proj_params_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTest,
+                         ::testing::Values(av1_calc_proj_params_neon));
+#endif  // HAVE_NEON
+
 }  // namespace get_proj_subspace_test_lowbd
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -565,7 +571,7 @@
 class GetProjSubspaceTestHBD
     : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     target_func_ = GET_PARAM(0);
     src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
                                    sizeof(*src_)));
@@ -580,7 +586,7 @@
                                    sizeof(*flt1_)));
     ASSERT_NE(flt1_, nullptr);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dgd_);
     aom_free(flt0_);
@@ -611,7 +617,9 @@
   const int flt0_stride = MAX_DATA_BLOCK;
   const int flt1_stride = MAX_DATA_BLOCK;
   sgr_params_type params;
-  const int iters = run_times == 1 ? kIterations : 4;
+  const int iters = run_times == 1 ? kIterations : 3;
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
     int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -623,10 +631,8 @@
       flt1_[i] = rng_.Rand15Signed();
     }
 
-    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
-    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
     uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
     uint8_t *src = CONVERT_TO_BYTEPTR(src_);
 
@@ -671,6 +677,8 @@
   const int flt1_stride = MAX_DATA_BLOCK;
   sgr_params_type params;
   const int iters = kIterations;
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
     int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
@@ -681,10 +689,8 @@
       flt0_[i] = rng_.Rand15Signed();
       flt1_[i] = rng_.Rand15Signed();
     }
-    params.r[0] = 1;
-    params.r[1] = 1;
-    params.s[0] = rng_.Rand8() % MAX_RADIUS;
-    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    params.r[0] = kR0[iter % 3];
+    params.r[1] = kR1[iter % 3];
     uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
     uint8_t *src = CONVERT_TO_BYTEPTR(src_);
 
@@ -728,6 +734,11 @@
                          ::testing::Values(av1_calc_proj_params_high_bd_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTestHBD,
+                         ::testing::Values(av1_calc_proj_params_high_bd_neon));
+#endif  // HAVE_NEON
 }  // namespace get_proj_subspace_test_hbd
 
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/postproc_filters_test.cc b/test/postproc_filters_test.cc
index 37de5d2..9584dd8 100644
--- a/test/postproc_filters_test.cc
+++ b/test/postproc_filters_test.cc
@@ -30,13 +30,13 @@
       : EncoderTest(GET_PARAM(0)), set_skip_postproc_filtering_(false),
         frame_number_(0), cpu_used_(GET_PARAM(1)), bd_(GET_PARAM(2)) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kAllIntra);
     cfg_.g_input_bit_depth = bd_;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     frame_number_ = video->frame();
     if (frame_number_ == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
@@ -53,14 +53,14 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     ::libaom_test::MD5 md5_enc;
     md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
                 pkt->data.frame.sz);
     md5_enc_.push_back(md5_enc.Get());
   }
 
-  virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
     const aom_image_t *img_enc = encoder->GetPreviewFrame();
     if (!set_skip_postproc_filtering_) {
       ASSERT_NE(img_enc, nullptr);
diff --git a/test/quant_test.cc b/test/quant_test.cc
index a042af1..afbabb3 100644
--- a/test/quant_test.cc
+++ b/test/quant_test.cc
@@ -32,15 +32,15 @@
       public ::libaom_test::EncoderTest {
  protected:
   QMTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~QMTest() {}
+  ~QMTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_ENABLE_QM, 1);
@@ -119,9 +119,9 @@
         quant_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
     quant_bound_violated_ = false;
   }
-  virtual ~QuantizerBoundsCheckTestLarge() {}
+  ~QuantizerBoundsCheckTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -135,17 +135,17 @@
     }
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 04e8306..328d5b1 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -100,9 +100,9 @@
         tx_size_(GET_TEMPLATE_PARAM(2)), type_(GET_TEMPLATE_PARAM(3)),
         bd_(GET_TEMPLATE_PARAM(4)) {}
 
-  virtual ~QuantizeTestBase() {}
+  ~QuantizeTestBase() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
     ASSERT_NE(qtab_, nullptr);
     const int n_coeffs = coeff_num();
@@ -112,7 +112,7 @@
     InitQuantizer();
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(qtab_);
     qtab_ = nullptr;
     aom_free(coeff_);
@@ -149,8 +149,8 @@
     // Testing uses luminance quantization table
     const int16_t *zbin = qtab_->quant.y_zbin[q];
 
-    const int16_t *round = 0;
-    const int16_t *quant = 0;
+    const int16_t *round = nullptr;
+    const int16_t *quant = nullptr;
     if (type_ == TYPE_B) {
       round = qtab_->quant.y_round[q];
       quant = qtab_->quant.y_quant[q];
diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
index 0d8d48f..cc054b6 100644
--- a/test/ratectrl_rtc_test.cc
+++ b/test/ratectrl_rtc_test.cc
@@ -36,12 +36,14 @@
   RcInterfaceTest()
       : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
         encoder_exit_(false), layer_frame_cnt_(0), superframe_cnt_(0),
-        dynamic_temporal_layers_(false), dynamic_spatial_layers_(false) {
+        frame_cnt_(0), dynamic_temporal_layers_(false),
+        dynamic_spatial_layers_(false), num_drops_(0), max_consec_drop_(0),
+        frame_drop_thresh_(0) {
     memset(&svc_params_, 0, sizeof(svc_params_));
     memset(&layer_id_, 0, sizeof(layer_id_));
   }
 
-  ~RcInterfaceTest() override {}
+  ~RcInterfaceTest() override = default;
 
  protected:
   void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
@@ -57,10 +59,15 @@
     if (video->frame() == 0 && layer_frame_cnt_ == 0) {
       encoder->Control(AOME_SET_CPUUSED, 7);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
-      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      if (rc_cfg_.is_screen) {
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      } else {
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      }
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT,
                        rc_cfg_.max_intra_bitrate_pct);
       if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      encoder->Control(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, max_consec_drop_);
     }
     // SVC specific settings
     if (use_svc) {
@@ -140,20 +147,24 @@
       return;
     }
     layer_frame_cnt_++;
+    frame_cnt_++;
     if (layer_id_.spatial_layer_id == rc_cfg_.ss_number_layers - 1)
       superframe_cnt_++;
     int qp;
     encoder->Control(AOME_GET_LAST_QUANTIZER, &qp);
-    rc_api_->ComputeQP(frame_params_);
-    ASSERT_EQ(rc_api_->GetQP(), qp);
-    int encoder_lpf_level;
-    encoder->Control(AOME_GET_LOOPFILTER_LEVEL, &encoder_lpf_level);
-    aom::AV1LoopfilterLevel loopfilter_level = rc_api_->GetLoopfilterLevel();
-    ASSERT_EQ(loopfilter_level.filter_level[0], encoder_lpf_level);
-    aom::AV1CdefInfo cdef_level = rc_api_->GetCdefInfo();
-    int cdef_y_strengths[16];
-    encoder->Control(AV1E_GET_LUMA_CDEF_STRENGTH, cdef_y_strengths);
-    ASSERT_EQ(cdef_level.cdef_strength_y, cdef_y_strengths[0]);
+    if (rc_api_->ComputeQP(frame_params_) == aom::FrameDropDecision::kOk) {
+      ASSERT_EQ(rc_api_->GetQP(), qp) << "at frame " << frame_cnt_ - 1;
+      int encoder_lpf_level;
+      encoder->Control(AOME_GET_LOOPFILTER_LEVEL, &encoder_lpf_level);
+      aom::AV1LoopfilterLevel loopfilter_level = rc_api_->GetLoopfilterLevel();
+      ASSERT_EQ(loopfilter_level.filter_level[0], encoder_lpf_level);
+      aom::AV1CdefInfo cdef_level = rc_api_->GetCdefInfo();
+      int cdef_y_strengths[16];
+      encoder->Control(AV1E_GET_LUMA_CDEF_STRENGTH, cdef_y_strengths);
+      ASSERT_EQ(cdef_level.cdef_strength_y, cdef_y_strengths[0]);
+    } else {
+      num_drops_++;
+    }
   }
 
   void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
@@ -181,6 +192,43 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   }
 
+  void RunOneLayerScreen() {
+    key_interval_ = 10000;
+    SetConfig();
+    rc_cfg_.is_screen = true;
+    rc_cfg_.width = 352;
+    rc_cfg_.height = 288;
+    rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerDropFramesCBR() {
+    key_interval_ = 10000;
+    max_consec_drop_ = 8;
+    frame_drop_thresh_ = 30;
+    SetConfig();
+    rc_cfg_.target_bandwidth = 100;
+    cfg_.rc_target_bitrate = 100;
+    rc_cfg_.max_quantizer = 50;
+    cfg_.rc_max_quantizer = 50;
+    rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
   void RunOneLayerPeriodicKey() {
     key_interval_ = 100;
     SetConfig();
@@ -270,6 +318,8 @@
     rc_cfg_.max_quantizers[0] = 52;
     rc_cfg_.min_quantizers[0] = 2;
     rc_cfg_.aq_mode = aq_mode_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+    rc_cfg_.max_consec_drop = max_consec_drop_;
 
     // Encoder settings for ground truth.
     cfg_.g_w = 640;
@@ -288,6 +338,7 @@
     cfg_.rc_target_bitrate = 1000;
     cfg_.kf_min_dist = key_interval_;
     cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
   }
 
   void SetConfigSvc(int number_spatial_layers, int number_temporal_layers) {
@@ -425,14 +476,22 @@
   aom_svc_layer_id_t layer_id_;
   int layer_frame_cnt_;
   int superframe_cnt_;
+  int frame_cnt_;
   bool dynamic_temporal_layers_;
   bool dynamic_spatial_layers_;
+  int num_drops_;
+  int max_consec_drop_;
+  int frame_drop_thresh_;
 };
 
 TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
 
+TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); }
+
 TEST_P(RcInterfaceTest, OneLayerPeriodicKey) { RunOneLayerPeriodicKey(); }
 
+TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
 TEST_P(RcInterfaceTest, Svc) { RunSvc(); }
 
 TEST_P(RcInterfaceTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index b45b7bb..ee1a989 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -28,18 +28,18 @@
 namespace {
 using libaom_test::ACMRandom;
 
-typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask,
-                                           DIFFWTD_MASK_TYPE mask_type,
-                                           const uint8_t *src0, int src0_stride,
-                                           const uint8_t *src1, int src1_stride,
-                                           int h, int w);
+using BuildCompDiffWtdMaskFunc = void (*)(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
+                                          int h, int w);
 
-typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
-    BuildCompDiffwtdMaskDParam;
+using BuildCompDiffwtdMaskDParam =
+    std::tuple<BLOCK_SIZE, BuildCompDiffWtdMaskFunc>;
 
-#if HAVE_SSE4_1
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
 ::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
-    buildcompdiffwtdmaskd_func filter) {
+    BuildCompDiffWtdMaskFunc filter) {
   return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
                             ::testing::Values(filter));
 }
@@ -48,175 +48,53 @@
 class BuildCompDiffwtdMaskTest
     : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
  public:
-  virtual ~BuildCompDiffwtdMaskTest() {}
+  BuildCompDiffwtdMaskTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskTest() override = default;
 
-  virtual void TearDown() {}
-  void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
-               const DIFFWTD_MASK_TYPE type);
+ protected:
+  void RunTest(BuildCompDiffWtdMaskFunc test_impl, bool is_speed,
+               const DIFFWTD_MASK_TYPE type) {
+    const int sb_type = GET_PARAM(0);
+    const int width = block_size_wide[sb_type];
+    const int height = block_size_high[sb_type];
+    DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
+    for (int i = 0; i < width * height; i++) {
+      src0[i] = rnd_.Rand8();
+      src1[i] = rnd_.Rand8();
+    }
+    const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1,
+                                        width, height, width);
+    }
+    const double t1 = get_time_mark(&timer);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      test_impl(mask_test, type, src0, width, src1, width, height, width);
+    }
+    const double t2 = get_time_mark(&timer);
+    if (is_speed) {
+      printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+      printf("(%3.2f)\n", t1 / t2);
+    }
+    for (int r = 0; r < height; ++r) {
+      for (int c = 0; c < width; ++c) {
+        ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+            << "[" << r << "," << c << "] " << run_times << " @ " << width
+            << "x" << height << " inv " << type;
+      }
+    }
+  }
 
  private:
   ACMRandom rnd_;
 };
-
-typedef void (*buildcompdiffwtdmaskd16_func)(
-    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
-    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
-    ConvolveParams *conv_params, int bd);
-
-typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
-    BuildCompDiffwtdMaskD16Param;
-
-#if HAVE_SSE4_1 || HAVE_NEON
-::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
-    buildcompdiffwtdmaskd16_func filter) {
-  return ::testing::Combine(::testing::Range(8, 13, 2),
-                            ::testing::Values(filter),
-                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-#endif
-class BuildCompDiffwtdMaskD16Test
-    : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
- public:
-  ~BuildCompDiffwtdMaskD16Test() {}
-  virtual void TearDown() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-
- protected:
-  void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
-  void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
-                    DIFFWTD_MASK_TYPE mask_type);
-  libaom_test::ACMRandom rnd_;
-};  // class BuildCompDiffwtdMaskD16Test
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
-
-void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
-    buildcompdiffwtdmaskd16_func test_impl) {
-  const int block_idx = GET_PARAM(2);
-  const int bd = GET_PARAM(0);
-  const int width = block_size_wide[block_idx];
-  const int height = block_size_high[block_idx];
-  DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
-
-  int in_precision =
-      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
-
-  for (int i = 0; i < MAX_SB_SQUARE; i++) {
-    src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
-    src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
-  }
-
-  for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
-    av1_build_compound_diffwtd_mask_d16_c(
-        mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
-        height, width, &conv_params, bd);
-
-    test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
-              height, width, &conv_params, bd);
-
-    for (int r = 0; r < height; ++r) {
-      for (int c = 0; c < width; ++c) {
-        ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
-            << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
-            << " Pixel mismatch at index "
-            << "[" << r << "," << c << "] "
-            << " @ " << width << "x" << height << " inv " << mask_type;
-      }
-    }
-  }
-}
-
-void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
-    buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
-  const int block_idx = GET_PARAM(2);
-  const int bd = GET_PARAM(0);
-  const int width = block_size_wide[block_idx];
-  const int height = block_size_high[block_idx];
-  DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
-
-  int in_precision =
-      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
-
-  for (int i = 0; i < MAX_SB_SQUARE; i++) {
-    src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
-    src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
-  }
-
-  const int num_loops = 10000000 / (width + height);
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-
-  for (int i = 0; i < num_loops; ++i)
-    av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
-                                          width, height, width, &conv_params,
-                                          bd);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-
-  aom_usec_timer timer1;
-  aom_usec_timer_start(&timer1);
-
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(mask, mask_type, src0, width, src1, width, height, width,
-              &conv_params, bd);
-
-  aom_usec_timer_mark(&timer1);
-  const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-  printf("av1_build_compound_diffwtd_mask_d16  %3dx%-3d: %7.2f \n", width,
-         height, elapsed_time / double(elapsed_time1));
-}
-#if HAVE_SSE4_1
-void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
-                                       const int is_speed,
-                                       const DIFFWTD_MASK_TYPE type) {
-  const int sb_type = GET_PARAM(0);
-  const int width = block_size_wide[sb_type];
-  const int height = block_size_high[sb_type];
-  DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  for (int i = 0; i < width * height; i++) {
-    src0[i] = rnd.Rand8();
-    src1[i] = rnd.Rand8();
-  }
-  const int run_times = is_speed ? (10000000 / (width + height)) : 1;
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < run_times; ++i) {
-    av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width,
-                                      height, width);
-  }
-  const double t1 = get_time_mark(&timer);
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < run_times; ++i) {
-    test_impl(mask_test, type, src0, width, src1, width, height, width);
-  }
-  const double t2 = get_time_mark(&timer);
-  if (is_speed) {
-    printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
-    printf("(%3.2f)\n", t1 / t2);
-  }
-  for (int r = 0; r < height; ++r) {
-    for (int c = 0; c < width; ++c) {
-      ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
-          << "[" << r << "," << c << "] " << run_times << " @ " << width << "x"
-          << height << " inv " << type;
-    }
-  }
-}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskTest);
 
 TEST_P(BuildCompDiffwtdMaskTest, match) {
   RunTest(GET_PARAM(1), 0, DIFFWTD_38);
@@ -226,7 +104,246 @@
   RunTest(GET_PARAM(1), 1, DIFFWTD_38);
   RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
 }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
 #endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+using BuildCompDiffWtdMaskHighbdFunc =
+    void (*)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+             int src0_stride, const uint8_t *src1, int src1_stride, int h,
+             int w, int bd);
+
+using BuildCompDiffwtdMaskHighbdParam =
+    std::tuple<BLOCK_SIZE, int, BuildCompDiffWtdMaskHighbdFunc>;
+
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
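+// Exercise all block sizes at bit depths 8, 10 and 12 with the given implementation.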
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskHighbdParam>
+BuildParamsHighbd(BuildCompDiffWtdMaskHighbdFunc filter) {
+  return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
+                            ::testing::Values(8, 10, 12),
+                            ::testing::Values(filter));
+}
+#endif
+
+class BuildCompDiffwtdMaskHighbdTest
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskHighbdParam> {
+ public:
+  BuildCompDiffwtdMaskHighbdTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskHighbdTest() override = default;
+
+ protected:
+  void RunTest(BuildCompDiffWtdMaskHighbdFunc test_impl, bool is_speed,
+               const DIFFWTD_MASK_TYPE type) {
+    const int sb_type = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    const int width = block_size_wide[sb_type];
+    const int height = block_size_high[sb_type];
+    const int mask = (1 << bd) - 1;
+    DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, src1[MAX_SB_SQUARE]);
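+    // Fill both sources with random samples constrained to bd bits.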
+    for (int i = 0; i < width * height; i++) {
+      src0[i] = rnd_.Rand16() & mask;
+      src1[i] = rnd_.Rand16() & mask;
+    }
+    const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+    aom_usec_timer timer;
+
+    aom_usec_timer_start(&timer);
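+    // CONVERT_TO_BYTEPTR passes the 16-bit buffers through the uint8_t*
+    // interface expected by the highbd functions.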
+    for (int i = 0; i < run_times; ++i) {
+      uint8_t *src0_8 = CONVERT_TO_BYTEPTR(src0);
+      uint8_t *src1_8 = CONVERT_TO_BYTEPTR(src1);
+      av1_build_compound_diffwtd_mask_highbd_c(
+          mask_ref, type, src0_8, width, src1_8, width, height, width, bd);
+    }
+    const double t1 = get_time_mark(&timer);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      uint8_t *src0_8 = CONVERT_TO_BYTEPTR(src0);
+      uint8_t *src1_8 = CONVERT_TO_BYTEPTR(src1);
+      test_impl(mask_test, type, src0_8, width, src1_8, width, height, width,
+                bd);
+    }
+    const double t2 = get_time_mark(&timer);
+
+    if (is_speed) {
+      printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+      printf("(%3.2f)\n", t1 / t2);
+    }
+    for (int r = 0; r < height; ++r) {
+      for (int c = 0; c < width; ++c) {
+        ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+            << "[" << r << "," << c << "] " << run_times << " @ " << width
+            << "x" << height << " inv " << type;
+      }
+    }
+  }
+
+ private:
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskHighbdTest);
+
+TEST_P(BuildCompDiffwtdMaskHighbdTest, match) {
+  RunTest(GET_PARAM(2), 0, DIFFWTD_38);
+  RunTest(GET_PARAM(2), 0, DIFFWTD_38_INV);
+}
+TEST_P(BuildCompDiffwtdMaskHighbdTest, DISABLED_Speed) {
+  RunTest(GET_PARAM(2), 1, DIFFWTD_38);
+  RunTest(GET_PARAM(2), 1, DIFFWTD_38_INV);
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_neon));
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+using BuildCompDiffWtdMaskD16Func = void (*)(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd);
+
+using BuildCompDiffwtdMaskD16Param =
+    std::tuple<int, BuildCompDiffWtdMaskD16Func, BLOCK_SIZE>;
+
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
+    BuildCompDiffWtdMaskD16Func filter) {
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+#endif
+
+class BuildCompDiffwtdMaskD16Test
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
+ public:
+  BuildCompDiffwtdMaskD16Test() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskD16Test() override = default;
+
+ protected:
+  void RunCheckOutput(BuildCompDiffWtdMaskD16Func test_impl) {
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    const int width = block_size_wide[block_idx];
+    const int height = block_size_high[block_idx];
+    DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+    ConvolveParams conv_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+    const int in_precision =
+        bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
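+    // Keep the random inputs within the nominal precision of the
+    // intermediate (d16) convolve buffer.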
+    for (int i = 0; i < MAX_SB_SQUARE; i++) {
+      src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+      src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+    }
+
+    for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
+      av1_build_compound_diffwtd_mask_d16_c(
+          mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+          height, width, &conv_params, bd);
+
+      test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1,
+                width, height, width, &conv_params, bd);
+
+      for (int r = 0; r < height; ++r) {
+        for (int c = 0; c < width; ++c) {
+          ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+              << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
+              << " Pixel mismatch at index "
+              << "[" << r << "," << c << "] "
+              << " @ " << width << "x" << height << " inv " << mask_type;
+        }
+      }
+    }
+  }
+
+  void RunSpeedTest(BuildCompDiffWtdMaskD16Func test_impl,
+                    DIFFWTD_MASK_TYPE mask_type) {
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    const int width = block_size_wide[block_idx];
+    const int height = block_size_high[block_idx];
+    DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+    ConvolveParams conv_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+    const int in_precision =
+        bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+    for (int i = 0; i < MAX_SB_SQUARE; i++) {
+      src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+      src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+    }
+
+    const int num_loops = 10000000 / (width + height);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
+                                            width, height, width, &conv_params,
+                                            bd);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(mask, mask_type, src0, width, src1, width, height, width,
+                &conv_params, bd);
+
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
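+    // Report the C reference time relative to the optimized implementation;
+    // values above 1.0 indicate a speed-up.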
+    printf("av1_build_compound_diffwtd_mask_d16  %3dx%-3d: %7.2f \n", width,
+           height, elapsed_time / double(elapsed_time1));
+  }
+
+ private:
+  ACMRandom rnd_;
+};  // class BuildCompDiffwtdMaskD16Test
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
+
 TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
   RunCheckOutput(GET_PARAM(1));
 }
@@ -237,18 +354,12 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
-                         BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
-
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BuildCompDiffwtdMaskD16Test,
     BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
-                         BuildParams(av1_build_compound_diffwtd_mask_avx2));
-
 INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
                          BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
 #endif
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 437b8e0..7bad453 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -186,7 +186,7 @@
   }
   int flag_codec_;
   bool change_start_resln_;
-  virtual ~ResizingVideoSource() {}
+  ~ResizingVideoSource() override = default;
 
  protected:
   void Begin() override {
@@ -215,12 +215,12 @@
  protected:
   ResizeTest() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~ResizeTest() {}
+  ~ResizeTest() override = default;
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (GET_PARAM(1) == ::libaom_test::kRealTime) {
         encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -230,8 +230,8 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
@@ -279,15 +279,15 @@
   ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeInternalTestLarge() {}
+  ~ResizeInternalTestLarge() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("av10-2-05-resize.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
@@ -298,8 +298,8 @@
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (change_config_) {
       int new_q = 60;
       if (video->frame() == 0) {
@@ -323,13 +323,13 @@
     }
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 4.1);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -402,11 +402,12 @@
  protected:
   ResizeRealtimeTest()
       : EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)),
-        set_scale_mode_(false), set_scale_mode2_(false) {}
-  virtual ~ResizeRealtimeTest() {}
+        set_scale_mode_(false), set_scale_mode2_(false),
+        set_scale_mode3_(false) {}
+  ~ResizeRealtimeTest() override = default;
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_AQ_MODE, 3);
       encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
@@ -433,6 +434,13 @@
       else if (video->frame() > 40)
         mode = { AOME_THREEFOUR, AOME_THREEFOUR };
       encoder->Control(AOME_SET_SCALEMODE, &mode);
+    } else if (set_scale_mode3_) {
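+      // Halve the width up to frame 30, then return to the full resolution.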
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 30)
+        mode = { AOME_ONETWO, AOME_NORMAL };
+      else
+        mode = { AOME_NORMAL, AOME_NORMAL };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
     }
 
     if (change_bitrate_ && video->frame() == frame_change_bitrate_) {
@@ -442,17 +450,17 @@
     }
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
-  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
@@ -483,7 +491,7 @@
     // the width and height of the frame are swapped
     cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
         AOMMAX(kInitialWidth, kInitialHeight);
-    if (set_scale_mode_ || set_scale_mode2_) {
+    if (set_scale_mode_ || set_scale_mode2_ || set_scale_mode3_) {
       cfg_.rc_dropframe_thresh = 0;
       cfg_.g_forced_max_frame_width = 1280;
       cfg_.g_forced_max_frame_height = 1280;
@@ -499,6 +507,7 @@
   int mismatch_nframes_;
   bool set_scale_mode_;
   bool set_scale_mode2_;
+  bool set_scale_mode3_;
 };
 
 // Check the AOME_SET_SCALEMODE control by downsizing to
@@ -509,6 +518,7 @@
   cfg_.g_h = 720;
   set_scale_mode_ = true;
   set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
   DefaultConfig();
   change_bitrate_ = false;
   mismatch_nframes_ = 0;
@@ -544,6 +554,7 @@
   cfg_.g_h = 180;
   set_scale_mode_ = true;
   set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
   DefaultConfig();
   change_bitrate_ = false;
   mismatch_nframes_ = 0;
@@ -578,6 +589,7 @@
   cfg_.g_h = 720;
   set_scale_mode_ = false;
   set_scale_mode2_ = true;
+  set_scale_mode3_ = false;
   DefaultConfig();
   change_bitrate_ = false;
   mismatch_nframes_ = 0;
@@ -604,12 +616,45 @@
   }
 }
 
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2 horizontally only and then back up to original.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode3) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
+  set_scale_mode3_ = true;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 640;
+    unsigned int expected_h = 720;
+    if (frame > 30) {
+      expected_w = 1280;
+      expected_h = 720;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
   video.flag_codec_ = 1;
   change_bitrate_ = false;
   set_scale_mode_ = false;
   set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
   DefaultConfig();
@@ -651,6 +696,7 @@
   change_bitrate_ = false;
   set_scale_mode_ = false;
   set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
   DefaultConfig();
@@ -700,6 +746,7 @@
   frame_change_bitrate_ = 120;
   set_scale_mode_ = false;
   set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
   DefaultConfig();
@@ -757,15 +804,15 @@
   ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeCspTest() {}
+  ~ResizeCspTest() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("av11-2-05-cspchape.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
@@ -776,13 +823,13 @@
 #endif
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -809,7 +856,7 @@
     limit_ = 30;
   }
 
-  virtual ~ResizingCspVideoSource() {}
+  ~ResizingCspVideoSource() override = default;
 };
 
 #if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) || \
@@ -845,9 +892,9 @@
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         resize_mode_(GET_PARAM(2)), resize_denominator_(GET_PARAM(3)),
         resize_kf_denominator_(GET_PARAM(4)), cpu_used_(GET_PARAM(5)) {}
-  virtual ~ResizeModeTestLarge() {}
+  ~ResizeModeTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -861,8 +908,8 @@
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index 735d799..f1f9e01 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -94,9 +94,9 @@
         aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
         tile_columns_(GET_PARAM(5)), tile_rows_(GET_PARAM(6)) {}
 
-  virtual ~RTEndToEndTest() {}
+  ~RTEndToEndTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kRealTime);
 
     cfg_.g_threads = threads_;
@@ -107,18 +107,18 @@
     cfg_.kf_min_dist = 9999;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
       encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 0a39ca6..5212748 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -39,12 +39,6 @@
                                   const uint8_t *second_pred);
 typedef std::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
 
-typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, const uint8_t *ref,
-                                   int ref_stride,
-                                   const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
-
 typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
@@ -138,15 +132,13 @@
     comp_pred16_test_ = nullptr;
   }
 
-  virtual void TearDown() {}
-
  protected:
   // Handle up to 4 128x128 blocks, with stride up to 256
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 128 * 256;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
-  virtual void SetUp() {
+  void SetUp() override {
     if (bd_ == -1) {
       use_high_bit_depth_ = false;
       bit_depth_ = AOM_BITS_8;
@@ -255,31 +247,6 @@
     return sad;
   }
 
-  void ReferenceDistWtdCompAvg(int block_idx) {
-    const uint8_t *const reference8 = GetReference(block_idx);
-    const uint8_t *const second_pred8 = second_pred_;
-    uint8_t *const comp_pred8 = comp_pred_;
-    const uint16_t *const reference16 =
-        CONVERT_TO_SHORTPTR(GetReference(block_idx));
-    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
-    uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
-    for (int h = 0; h < height_; ++h) {
-      for (int w = 0; w < width_; ++w) {
-        if (!use_high_bit_depth_) {
-          const int tmp =
-              second_pred8[h * width_ + w] * jcp_param_.bck_offset +
-              reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
-          comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
-        } else {
-          const int tmp =
-              second_pred16[h * width_ + w] * jcp_param_.bck_offset +
-              reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
-          comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
-        }
-      }
-    }
-  }
-
   unsigned int ReferenceDistWtdSADavg(int block_idx) {
     unsigned int sad = 0;
     const uint8_t *const reference8 = GetReference(block_idx);
@@ -401,7 +368,7 @@
   }
 
   void SADForSpeedTest(unsigned int *results,
-                       const uint8_t *const *references) {
+                       const uint8_t *const *references) override {
     GET_PARAM(2)
     (source_data_, source_stride_, references, reference_stride_, results);
   }
@@ -432,7 +399,7 @@
   }
 
   void SADForSpeedTest(unsigned int *results,
-                       const uint8_t *const *references) {
+                       const uint8_t *const *references) override {
     GET_PARAM(2)
     (source_data_, source_stride_, references, reference_stride_, results);
   }
@@ -475,7 +442,7 @@
   }
 
   void SADForSpeedTest(unsigned int *results,
-                       const uint8_t *const *references) {
+                       const uint8_t *const *references) override {
     GET_PARAM(2)
     (source_data_, source_stride_, references, reference_stride_, results);
   }
@@ -504,7 +471,7 @@
   }
 
   void SADForSpeedTest(unsigned int *results,
-                       const uint8_t *const *references) {
+                       const uint8_t *const *references) override {
     GET_PARAM(2)
     (source_data_, source_stride_, references[0], reference_stride_);
     (void)results;
@@ -534,7 +501,7 @@
   }
 
   void SADForSpeedTest(unsigned int *results,
-                       const uint8_t *const *references) {
+                       const uint8_t *const *references) override {
     GET_PARAM(2)
     (source_data_, source_stride_, references[0], reference_stride_);
     (void)results;
@@ -565,40 +532,6 @@
   }
 };
 
-class DistWtdCompAvgTest
-    : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
-      public SADTestBase {
- public:
-  DistWtdCompAvgTest()
-      : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
-
- protected:
-  void dist_wtd_comp_avg(int block_idx) {
-    const uint8_t *const reference = GetReference(block_idx);
-
-    API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
-                                          height_, reference, reference_stride_,
-                                          &jcp_param_));
-  }
-
-  void CheckCompAvg() {
-    for (int j = 0; j < 2; ++j) {
-      for (int i = 0; i < 4; ++i) {
-        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
-        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
-
-        ReferenceDistWtdCompAvg(0);
-        dist_wtd_comp_avg(0);
-
-        for (int y = 0; y < height_; ++y)
-          for (int x = 0; x < width_; ++x)
-            ASSERT_EQ(comp_pred_[y * width_ + x],
-                      comp_pred_test_[y * width_ + x]);
-      }
-    }
-  }
-};
-
 class DistWtdSADavgTest
     : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
       public SADTestBase {
@@ -807,38 +740,6 @@
   source_stride_ = tmp_stride;
 }
 
-TEST_P(DistWtdCompAvgTest, MaxRef) {
-  FillConstant(reference_data_, reference_stride_, mask_);
-  FillConstant(second_pred_, width_, 0);
-  CheckCompAvg();
-}
-
-TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
-  FillConstant(reference_data_, reference_stride_, 0);
-  FillConstant(second_pred_, width_, mask_);
-  CheckCompAvg();
-}
-
-TEST_P(DistWtdCompAvgTest, ShortRef) {
-  const int tmp_stride = reference_stride_;
-  reference_stride_ >>= 1;
-  FillRandom(reference_data_, reference_stride_);
-  FillRandom(second_pred_, width_);
-  CheckCompAvg();
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(DistWtdCompAvgTest, UnalignedRef) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  const int tmp_stride = reference_stride_;
-  reference_stride_ -= 1;
-  FillRandom(reference_data_, reference_stride_);
-  FillRandom(second_pred_, width_);
-  CheckCompAvg();
-  reference_stride_ = tmp_stride;
-}
-
 TEST_P(DistWtdSADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
@@ -1445,38 +1346,6 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
-// TODO(chengchen): add highbd tests
-const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-
-#if !CONFIG_REALTIME_ONLY
-  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-#endif
-};
-
-INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
-                         ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
-
 const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
   make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
   make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
@@ -2227,6 +2096,56 @@
   make_tuple(8, 4, &aom_sad8x4_avg_neon, -1),
   make_tuple(4, 8, &aom_sad4x8_avg_neon, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 8),
+  make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 8),
+  make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 8),
+  make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 8),
+  make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 8),
+  make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 8),
+  make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 8),
+  make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 8),
+  make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 8),
+  make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 8),
+  make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 8),
+  make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 8),
+  make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 8),
+  make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 8),
+  make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 10),
+  make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 10),
+  make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 10),
+  make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 10),
+  make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 10),
+  make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 10),
+  make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 10),
+  make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 10),
+  make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 10),
+  make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 10),
+  make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 10),
+  make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 10),
+  make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 10),
+  make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 10),
+  make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 12),
+  make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 12),
+  make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 12),
+  make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 12),
+  make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 12),
+  make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 12),
+  make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 12),
+  make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 12),
+  make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 12),
+  make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 12),
+  make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 12),
+  make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 12),
+  make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 12),
+  make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_avg_neon, -1),
   make_tuple(32, 8, &aom_sad32x8_avg_neon, -1),
@@ -2234,10 +2153,61 @@
   make_tuple(16, 4, &aom_sad16x4_avg_neon, -1),
   make_tuple(8, 32, &aom_sad8x32_avg_neon, -1),
   make_tuple(4, 16, &aom_sad4x16_avg_neon, -1),
-#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
 
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon, -1),
+  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon, -1),
+  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon, -1),
+  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon, -1),
+  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon, -1),
+  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon, -1),
+  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon, -1),
+  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon, -1),
+  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon, -1),
+  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon, -1),
+  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon, -1),
+  make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_neon, -1),
+  make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_neon, -1),
+  make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_neon, -1),
+  make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_neon, -1),
+  make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_neon, -1),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon, -1),
+  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon, -1),
+  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon, -1),
+  make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_neon, -1),
+  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon, -1),
+  make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_neon, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdSADavgTest,
+                         ::testing::ValuesIn(dist_wtd_avg_neon_tests));
+
 const SadMxNx4Param x3d_neon_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x3d_neon, -1),
   make_tuple(128, 64, &aom_sad128x64x3d_neon, -1),
@@ -2255,6 +2225,56 @@
   make_tuple(8, 4, &aom_sad8x4x3d_neon, -1),
   make_tuple(4, 8, &aom_sad4x8x3d_neon, -1),
   make_tuple(4, 4, &aom_sad4x4x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 8),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 8),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 8),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 8),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 8),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 8),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 8),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 8),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 8),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 8),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 8),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 8),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 8),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 8),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 10),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 10),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 10),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 10),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 10),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 10),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 10),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 10),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 10),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 10),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 10),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 10),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 10),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 10),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 12),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 12),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 12),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 12),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 12),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 12),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 12),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 12),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 12),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 12),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 12),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 12),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 12),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x3d_neon, -1),
   make_tuple(32, 8, &aom_sad32x8x3d_neon, -1),
@@ -2262,12 +2282,189 @@
   make_tuple(16, 4, &aom_sad16x4x3d_neon, -1),
   make_tuple(8, 32, &aom_sad8x32x3d_neon, -1),
   make_tuple(4, 16, &aom_sad4x16x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADx3Test, ::testing::ValuesIn(x3d_neon_tests));
 
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+const SadMxNParam neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad128x64_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad64x128_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad64x64_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad64x32_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad32x64_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad32x32_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad32x16_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad16x32_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad16x16_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad32x8_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad16x64_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad16x4_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest,
+                         ::testing::ValuesIn(neon_dotprod_tests));
+
+const SadMxNParam skip_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_dotprod_tests));
+
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128_avg_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad128x64_avg_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad64x128_avg_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad64x64_avg_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad64x32_avg_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad32x64_avg_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad32x32_avg_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad32x16_avg_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad16x32_avg_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad16x16_avg_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16_avg_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad32x8_avg_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad16x64_avg_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad16x4_avg_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+                         ::testing::ValuesIn(avg_neon_dotprod_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, DistWtdSADavgTest,
+                         ::testing::ValuesIn(dist_wtd_avg_neon_dotprod_tests));
+
+const SadMxNx4Param x3d_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x3d_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad128x64x3d_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad64x128x3d_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad64x64x3d_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad64x32x3d_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad32x64x3d_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad32x32x3d_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad32x16x3d_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad16x32x3d_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad16x16x3d_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad16x8x3d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16x3d_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad32x8x3d_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad16x64x3d_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad16x4x3d_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx3Test,
+                         ::testing::ValuesIn(x3d_neon_dotprod_tests));
+
+const SadMxNx4Param x4d_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x4d_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad128x64x4d_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad64x128x4d_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad64x64x4d_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad32x32x4d_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16x4d_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad32x8x4d_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad16x4x4d_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test,
+                         ::testing::ValuesIn(x4d_neon_dotprod_tests));
+
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_SSE2
@@ -2750,39 +2947,6 @@
 // Only functions are x3, which do not have tests.
 #endif  // HAVE_SSE3
 
-#if HAVE_SSSE3
-const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-#if !CONFIG_REALTIME_ONLY
-  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-#endif
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
-                         ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
-#endif  // HAVE_SSSE3
-
 #if HAVE_SSE4_1
 // Only functions are x8, which do not have tests.
 #endif  // HAVE_SSE4_1
diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc
index 8ddc002..e27a2c6 100644
--- a/test/sb_multipass_test.cc
+++ b/test/sb_multipass_test.cc
@@ -42,9 +42,9 @@
     md5_dec_.clear();
     md5_enc_.clear();
   }
-  virtual ~AV1SBMultipassTest() { delete decoder_; }
+  ~AV1SBMultipassTest() override { delete decoder_; }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(::libaom_test::kTwoPassGood);
 
     cfg_.g_lag_in_frames = 5;
@@ -56,8 +56,8 @@
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       SetTileSize(encoder);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
@@ -75,7 +75,7 @@
     encoder->Control(AV1E_SET_TILE_ROWS, 1);
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     size_enc_.push_back(pkt->data.frame.sz);
 
     ::libaom_test::MD5 md5_enc;
diff --git a/test/scalability_test.cc b/test/scalability_test.cc
index 9ea8256..12cb03c 100644
--- a/test/scalability_test.cc
+++ b/test/scalability_test.cc
@@ -26,15 +26,15 @@
       public ::libaom_test::EncoderTest {
  protected:
   ScalabilityTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ScalabilityTest() {}
+  ~ScalabilityTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     num_spatial_layers_ = 2;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
       encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_);
diff --git a/test/screen_content_test.cc b/test/screen_content_test.cc
index 4d3e09a..974c50b 100644
--- a/test/screen_content_test.cc
+++ b/test/screen_content_test.cc
@@ -29,9 +29,9 @@
     is_screen_content_violated_ = true;
     tune_content_ = AOM_CONTENT_DEFAULT;
   }
-  virtual ~ScreenContentToolsTestLarge() {}
+  ~ScreenContentToolsTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -42,10 +42,10 @@
     cfg_.g_profile = 0;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -53,8 +53,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index a8461b5..3dd513b 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -30,9 +30,9 @@
 using std::make_tuple;
 using std::tuple;
 
-typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
-                        int eps, const int *xqd, uint8_t *dst8, int dst_stride,
-                        int32_t *tmpbuf, int bit_depth, int highbd);
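+// The self-guided filter functions now return an error status; the tests
+// below assert a zero (success) return.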
+typedef int (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
+                       int eps, const int *xqd, uint8_t *dst8, int dst_stride,
+                       int32_t *tmpbuf, int bit_depth, int highbd);
 
 // Test parameter list:
 //  <tst_fun_>
@@ -41,10 +41,8 @@
 class AV1SelfguidedFilterTest
     : public ::testing::TestWithParam<FilterTestParam> {
  public:
-  virtual ~AV1SelfguidedFilterTest() {}
-  virtual void SetUp() {}
-
-  virtual void TearDown() {}
+  ~AV1SelfguidedFilterTest() override = default;
+  void SetUp() override {}
 
  protected:
   void RunSpeedTest() {
@@ -91,9 +89,10 @@
           int h = AOMMIN(pu_height, height - k);
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
-          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                             output_p, out_stride, tmpbuf, 8,
-                                             0);
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              input_p, w, h, stride, eps, xqd, output_p, out_stride, tmpbuf, 8,
+              0);
+          ASSERT_EQ(ret_c, 0);
         }
     }
     aom_usec_timer_mark(&ref_timer);
@@ -108,8 +107,9 @@
           int h = AOMMIN(pu_height, height - k);
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
-          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
-                   tmpbuf, 8, 0);
+          const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf, 8, 0);
+          ASSERT_EQ(ret_tst, 0);
         }
     }
     aom_usec_timer_mark(&tst_timer);
@@ -181,11 +181,13 @@
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
           uint8_t *output2_p = output2 + k * out_stride + j;
-          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
-                   tmpbuf, 8, 0);
-          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                             output2_p, out_stride, tmpbuf, 8,
-                                             0);
+          const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf, 8, 0);
+          ASSERT_EQ(ret_tst, 0);
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              input_p, w, h, stride, eps, xqd, output2_p, out_stride, tmpbuf, 8,
+              0);
+          ASSERT_EQ(ret_c, 0);
         }
 
       for (j = 0; j < test_h; ++j)
@@ -234,10 +236,8 @@
 class AV1HighbdSelfguidedFilterTest
     : public ::testing::TestWithParam<HighbdFilterTestParam> {
  public:
-  virtual ~AV1HighbdSelfguidedFilterTest() {}
-  virtual void SetUp() {}
-
-  virtual void TearDown() {}
+  ~AV1HighbdSelfguidedFilterTest() override = default;
+  void SetUp() override {}
 
  protected:
   void RunSpeedTest() {
diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc
index 49c5804..64465c8 100644
--- a/test/sharpness_test.cc
+++ b/test/sharpness_test.cc
@@ -48,7 +48,7 @@
         cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0),
         nframes_(0) {}
 
-  ~SharpnessTest() override {}
+  ~SharpnessTest() override = default;
 
   void SetUp() override {
     InitializeConfig(encoding_mode_);
diff --git a/test/simd_cmp_neon.cc b/test/simd_cmp_neon.cc
deleted file mode 100644
index 53c1e2a..0000000
--- a/test/simd_cmp_neon.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__
-#define ARCH NEON
-#define ARCH_POSTFIX(name) name##_neon
-#define SIMD_NAMESPACE simd_test_neon
-#include "test/simd_cmp_impl.h"
-#endif
diff --git a/test/simd_impl.h b/test/simd_impl.h
index 8535e37..b564a7f 100644
--- a/test/simd_impl.h
+++ b/test/simd_impl.h
@@ -22,15 +22,13 @@
 template <typename param_signature>
 class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
  public:
-  virtual ~TestIntrinsic() {}
-  virtual void SetUp() {
+  ~TestIntrinsic() override = default;
+  void SetUp() override {
     mask = std::get<0>(this->GetParam());
     maskwidth = std::get<1>(this->GetParam());
     name = std::get<2>(this->GetParam());
   }
 
-  virtual void TearDown() {}
-
  protected:
   uint32_t mask, maskwidth;
   const char *name;
diff --git a/test/simd_neon_test.cc b/test/simd_neon_test.cc
deleted file mode 100644
index b67b188..0000000
--- a/test/simd_neon_test.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__
-#define ARCH NEON
-#define ARCH_POSTFIX(name) name##_neon
-#define SIMD_NAMESPACE simd_test_neon
-#include "test/simd_impl.h"
-#endif
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
index 68355ec..70d8da5 100644
--- a/test/sse_sum_test.cc
+++ b/test/sse_sum_test.cc
@@ -41,15 +41,15 @@
 
 class SumSSETest : public ::testing::TestWithParam<TestFuncs> {
  public:
-  virtual ~SumSSETest() {}
-  virtual void SetUp() {
+  ~SumSSETest() override = default;
+  void SetUp() override {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src_); }
+  void TearDown() override { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
diff --git a/test/still_picture_test.cc b/test/still_picture_test.cc
index e2eef94..3dfb1c8 100644
--- a/test/still_picture_test.cc
+++ b/test/still_picture_test.cc
@@ -27,9 +27,9 @@
         enable_full_header_(GET_PARAM(2)) {
     still_picture_coding_violated_ = false;
   }
-  virtual ~StillPicturePresenceTest() {}
+  ~StillPicturePresenceTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -39,18 +39,18 @@
     cfg_.g_limit = 1;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AV1E_SET_FORCE_VIDEO_MODE, 0);
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 4003e51..e591e65 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -49,15 +49,15 @@
     func_ = func;
     ref_func_ = ref_func;
     if (bit_depth == -1) {
-      hbd_ = 0;
+      hbd_ = false;
       bit_depth_ = AOM_BITS_8;
     } else {
-      hbd_ = 1;
+      hbd_ = true;
       bit_depth_ = static_cast<aom_bit_depth_t>(bit_depth);
     }
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
 
     const size_t max_width = 128;
@@ -82,7 +82,7 @@
     ASSERT_NE(diff_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (hbd_) {
       aom_free(CONVERT_TO_SHORTPTR(src_));
       aom_free(CONVERT_TO_SHORTPTR(pred_));
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 91f172d..cba33b7 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -20,6 +20,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_ports/mem.h"
+#include "av1/common/common_data.h"
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -43,16 +44,16 @@
 
 class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
  public:
-  virtual ~SumSquaresTest() {}
-  virtual void SetUp() {
+  ~SumSquaresTest() override = default;
+  void SetUp() override {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src_); }
-  void RunTest(int isRandom);
+  void TearDown() override { aom_free(src_); }
+  void RunTest(bool is_random);
   void RunSpeedTest();
 
   void GenRandomData(int width, int height, int stride) {
@@ -83,7 +84,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
 
-void SumSquaresTest::RunTest(int isRandom) {
+void SumSquaresTest::RunTest(bool is_random) {
   int failed = 0;
   for (int k = 0; k < kNumIterations; k++) {
     const int width = 4 * (rnd_(31) + 1);   // Up to 128x128
@@ -92,7 +93,7 @@
     while (stride < width) {                // Make sure it's valid
       stride = 4 << rnd_(7);
     }
-    if (isRandom) {
+    if (is_random) {
       GenRandomData(width, height, stride);
     } else {
       GenExtremeData(width, height, stride);
@@ -144,11 +145,11 @@
 }
 
 TEST_P(SumSquaresTest, OperationCheck) {
-  RunTest(1);  // GenRandomData
+  RunTest(true);  // GenRandomData
 }
 
 TEST_P(SumSquaresTest, ExtremeValues) {
-  RunTest(0);  // GenExtremeData
+  RunTest(false);  // GenExtremeData
 }
 
 TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
@@ -182,7 +183,7 @@
 // 1D version
 //////////////////////////////////////////////////////////////////////////////
 
-typedef uint64_t (*F1D)(const int16_t *src, uint32_t N);
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t n);
 typedef libaom_test::FuncParam<F1D> TestFuncs1D;
 
 class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
@@ -199,12 +200,12 @@
     for (int i = 0; i < kMaxSize * kMaxSize; ++i)
       src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
 
-    const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+    const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
                           : rng_(kMaxSize) + 1;
 
-    const uint64_t ref_res = params_.ref_func(src, N);
+    const uint64_t ref_res = params_.ref_func(src, n);
     uint64_t tst_res;
-    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -220,12 +221,12 @@
       for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
     }
 
-    const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+    const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
                           : rng_(kMaxSize) + 1;
 
-    const uint64_t ref_res = params_.ref_func(src, N);
+    const uint64_t ref_res = params_.ref_func(src, n);
     uint64_t tst_res;
-    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -245,23 +246,23 @@
 
 #endif  // HAVE_NEON
 
-typedef int64_t (*sse_func)(const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride, int width, int height);
-typedef libaom_test::FuncParam<sse_func> TestSSEFuncs;
+typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+                           int b_stride, int width, int height);
+typedef libaom_test::FuncParam<SSEFunc> TestSSEFuncs;
 
 typedef std::tuple<TestSSEFuncs, int> SSETestParam;
 
 class SSETest : public ::testing::TestWithParam<SSETestParam> {
  public:
-  virtual ~SSETest() {}
-  virtual void SetUp() {
+  ~SSETest() override = default;
+  void SetUp() override {
     params_ = GET_PARAM(0);
     width_ = GET_PARAM(1);
-    isHbd_ =
+    is_hbd_ =
 #if CONFIG_AV1_HIGHBITDEPTH
         params_.ref_func == aom_highbd_sse_c;
 #else
-        0;
+        false;
 #endif
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
@@ -270,25 +271,25 @@
     ASSERT_NE(ref_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(ref_);
   }
-  void RunTest(int isRandom, int width, int height, int run_times);
+  void RunTest(bool is_random, int width, int height, int run_times);
 
   void GenRandomData(int width, int height, int stride) {
-    uint16_t *pSrc = (uint16_t *)src_;
-    uint16_t *pRef = (uint16_t *)ref_;
+    uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
+    uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
     const int msb = 11;  // Up to 12 bit input
     const int limit = 1 << (msb + 1);
     for (int ii = 0; ii < height; ii++) {
       for (int jj = 0; jj < width; jj++) {
-        if (!isHbd_) {
+        if (!is_hbd_) {
           src_[ii * stride + jj] = rnd_.Rand8();
           ref_[ii * stride + jj] = rnd_.Rand8();
         } else {
-          pSrc[ii * stride + jj] = rnd_(limit);
-          pRef[ii * stride + jj] = rnd_(limit);
+          src16[ii * stride + jj] = rnd_(limit);
+          ref16[ii * stride + jj] = rnd_(limit);
         }
       }
     }
@@ -296,20 +297,20 @@
 
   void GenExtremeData(int width, int height, int stride, uint8_t *data,
                       int16_t val) {
-    uint16_t *pData = (uint16_t *)data;
+    uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
     for (int ii = 0; ii < height; ii++) {
       for (int jj = 0; jj < width; jj++) {
-        if (!isHbd_) {
-          data[ii * stride + jj] = (uint8_t)val;
+        if (!is_hbd_) {
+          data[ii * stride + jj] = static_cast<uint8_t>(val);
         } else {
-          pData[ii * stride + jj] = val;
+          data16[ii * stride + jj] = val;
         }
       }
     }
   }
 
  protected:
-  int isHbd_;
+  bool is_hbd_;
   int width_;
   TestSSEFuncs params_;
   uint8_t *src_;
@@ -318,7 +319,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
 
-void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
+void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
   int failed = 0;
   aom_usec_timer ref_timer, test_timer;
   for (int k = 0; k < 3; k++) {
@@ -326,10 +327,10 @@
     while (stride < width) {    // Make sure it's valid
       stride = 4 << rnd_(7);
     }
-    if (isRandom) {
+    if (is_random) {
       GenRandomData(width, height, stride);
     } else {
-      const int msb = isHbd_ ? 12 : 8;  // Up to 12 bit input
+      const int msb = is_hbd_ ? 12 : 8;  // Up to 12 bit input
       const int limit = (1 << msb) - 1;
       if (k == 0) {
         GenExtremeData(width, height, stride, src_, 0);
@@ -340,18 +341,18 @@
       }
     }
     int64_t res_ref, res_tst;
-    uint8_t *pSrc = src_;
-    uint8_t *pRef = ref_;
-    if (isHbd_) {
-      pSrc = CONVERT_TO_BYTEPTR(src_);
-      pRef = CONVERT_TO_BYTEPTR(ref_);
+    uint8_t *src = src_;
+    uint8_t *ref = ref_;
+    if (is_hbd_) {
+      src = CONVERT_TO_BYTEPTR(src_);
+      ref = CONVERT_TO_BYTEPTR(ref_);
     }
-    res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height);
-    res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height);
+    res_ref = params_.ref_func(src, stride, ref, stride, width, height);
+    res_tst = params_.tst_func(src, stride, ref, stride, width, height);
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < run_times; j++) {
-        params_.ref_func(pSrc, stride, pRef, stride, width, height);
+        params_.ref_func(src, stride, ref, stride, width, height);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -359,7 +360,7 @@
 
       aom_usec_timer_start(&test_timer);
       for (int j = 0; j < run_times; j++) {
-        params_.tst_func(pSrc, stride, pRef, stride, width, height);
+        params_.tst_func(src, stride, ref, stride, width, height);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =
@@ -374,7 +375,7 @@
       if (!failed) {
         failed = res_ref != res_tst;
         EXPECT_EQ(res_ref, res_tst)
-            << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test ["
+            << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
             << width << "x" << height
             << "] C output does not match optimized output.";
       }
@@ -384,19 +385,19 @@
 
 TEST_P(SSETest, OperationCheck) {
   for (int height = 4; height <= 128; height += 4) {
-    RunTest(1, width_, height, 1);  // GenRandomData
+    RunTest(true, width_, height, 1);  // GenRandomData
   }
 }
 
 TEST_P(SSETest, ExtremeValues) {
   for (int height = 4; height <= 128; height += 4) {
-    RunTest(0, width_, height, 1);
+    RunTest(false, width_, height, 1);
   }
 }
 
 TEST_P(SSETest, DISABLED_Speed) {
   for (int height = 4; height <= 128; height += 4) {
-    RunTest(1, width_, height, 100);
+    RunTest(true, width_, height, 100);
   }
 }
 
@@ -411,6 +412,14 @@
                          Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+TestSSEFuncs sse_neon_dotprod[] = {
+  TestSSEFuncs(&aom_sse_c, &aom_sse_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
+                         Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
+#endif  // HAVE_NEON_DOTPROD
+
 #if HAVE_SSE4_1
 TestSSEFuncs sse_sse4[] = {
   TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
@@ -442,21 +451,20 @@
                              int *x_sum, int64_t *x2_sum);
 typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs;
 
-typedef std::tuple<TestSSE_SumFuncs, int> SSE_SumTestParam;
+typedef std::tuple<TestSSE_SumFuncs, TX_SIZE> SSE_SumTestParam;
 
 class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
  public:
-  virtual ~SSE_Sum_Test() {}
-  virtual void SetUp() {
+  ~SSE_Sum_Test() override = default;
+  void SetUp() override {
     params_ = GET_PARAM(0);
-    width_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src_); }
-  void RunTest(int isRandom, int width, int height, int run_times);
+  void TearDown() override { aom_free(src_); }
+  void RunTest(bool is_random, int tx_size, int run_times);
 
   void GenRandomData(int width, int height, int stride) {
     const int msb = 11;  // Up to 12 bit input
@@ -478,21 +486,22 @@
   }
 
  protected:
-  int width_;
   TestSSE_SumFuncs params_;
   int16_t *src_;
   ACMRandom rnd_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
 
-void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
+void SSE_Sum_Test::RunTest(bool is_random, int tx_size, int run_times) {
   aom_usec_timer ref_timer, test_timer;
+  int width = tx_size_wide[tx_size];
+  int height = tx_size_high[tx_size];
   for (int k = 0; k < 3; k++) {
     int stride = 4 << rnd_(7);  // Up to 256 stride
     while (stride < width) {    // Make sure it's valid
       stride = 4 << rnd_(7);
     }
-    if (isRandom) {
+    if (is_random) {
       GenRandomData(width, height, stride);
     } else {
       const int msb = 12;  // Up to 12 bit input
@@ -547,37 +556,45 @@
 }
 
 TEST_P(SSE_Sum_Test, OperationCheck) {
-  for (int height = 4; height <= 64; height = height * 2) {
-    RunTest(1, width_, height, 1);  // GenRandomData
-  }
+  RunTest(true, GET_PARAM(1), 1);  // GenRandomData
 }
 
-TEST_P(SSE_Sum_Test, ExtremeValues) {
-  for (int height = 4; height <= 64; height = height * 2) {
-    RunTest(0, width_, height, 1);
-  }
-}
+TEST_P(SSE_Sum_Test, ExtremeValues) { RunTest(false, GET_PARAM(1), 1); }
 
-TEST_P(SSE_Sum_Test, DISABLED_Speed) {
-  for (int height = 4; height <= 64; height = height * 2) {
-    RunTest(1, width_, height, 10000);
-  }
-}
+TEST_P(SSE_Sum_Test, DISABLED_Speed) { RunTest(true, GET_PARAM(1), 10000); }
+
+#if HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
+const TX_SIZE kValidBlockSize[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
+                                    TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
+                                    TX_16X8,  TX_16X32, TX_32X16, TX_64X32,
+                                    TX_32X64, TX_4X16,  TX_16X4,  TX_8X32,
+                                    TX_32X8,  TX_16X64, TX_64X16 };
+#endif
 
 #if HAVE_SSE2
 TestSSE_SumFuncs sse_sum_sse2[] = { TestSSE_SumFuncs(
     &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_sse2) };
 INSTANTIATE_TEST_SUITE_P(SSE2, SSE_Sum_Test,
-                         Combine(ValuesIn(sse_sum_sse2), Range(4, 65, 4)));
+                         Combine(ValuesIn(sse_sum_sse2),
+                                 ValuesIn(kValidBlockSize)));
 #endif  // HAVE_SSE2
 
 #if HAVE_AVX2
 TestSSE_SumFuncs sse_sum_avx2[] = { TestSSE_SumFuncs(
     &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_avx2) };
 INSTANTIATE_TEST_SUITE_P(AVX2, SSE_Sum_Test,
-                         Combine(ValuesIn(sse_sum_avx2), Range(4, 65, 4)));
+                         Combine(ValuesIn(sse_sum_avx2),
+                                 ValuesIn(kValidBlockSize)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+TestSSE_SumFuncs sse_sum_neon[] = { TestSSE_SumFuncs(
+    &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, SSE_Sum_Test,
+                         Combine(ValuesIn(sse_sum_neon),
+                                 ValuesIn(kValidBlockSize)));
+#endif  // HAVE_NEON
+
 //////////////////////////////////////////////////////////////////////////////
 // 2D Variance test functions
 //////////////////////////////////////////////////////////////////////////////
@@ -589,8 +606,8 @@
 
 class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
  public:
-  virtual ~Lowbd2dVarTest() {}
-  virtual void SetUp() {
+  ~Lowbd2dVarTest() override = default;
+  void SetUp() override {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint8_t *>(
@@ -598,8 +615,8 @@
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src_); }
-  void RunTest(int isRandom);
+  void TearDown() override { aom_free(src_); }
+  void RunTest(bool is_random);
   void RunSpeedTest();
 
   void GenRandomData(int width, int height, int stride) {
@@ -630,7 +647,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
 
-void Lowbd2dVarTest::RunTest(int isRandom) {
+void Lowbd2dVarTest::RunTest(bool is_random) {
   int failed = 0;
   for (int k = 0; k < kNumIterations; k++) {
     const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
@@ -639,7 +656,7 @@
     while (stride < width) {                // Make sure it's valid
       stride = 4 << rnd_(8);
     }
-    if (isRandom) {
+    if (is_random) {
       GenRandomData(width, height, stride);
     } else {
       GenExtremeData(width, height, stride);
@@ -690,11 +707,11 @@
 }
 
 TEST_P(Lowbd2dVarTest, OperationCheck) {
-  RunTest(1);  // GenRandomData
+  RunTest(true);  // GenRandomData
 }
 
 TEST_P(Lowbd2dVarTest, ExtremeValues) {
-  RunTest(0);  // GenExtremeData
+  RunTest(false);  // GenExtremeData
 }
 
 TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
@@ -723,10 +740,18 @@
 
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, Lowbd2dVarTest,
+                         ::testing::Values(TestFuncVar2D(
+                             &aom_var_2d_u8_c, &aom_var_2d_u8_neon_dotprod)));
+
+#endif  // HAVE_NEON_DOTPROD
+
 class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
  public:
-  virtual ~Highbd2dVarTest() {}
-  virtual void SetUp() {
+  ~Highbd2dVarTest() override = default;
+  void SetUp() override {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint16_t *>(
@@ -734,8 +759,8 @@
     ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src_); }
-  void RunTest(int isRandom);
+  void TearDown() override { aom_free(src_); }
+  void RunTest(bool is_random);
   void RunSpeedTest();
 
   void GenRandomData(int width, int height, int stride) {
@@ -766,7 +791,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
 
-void Highbd2dVarTest::RunTest(int isRandom) {
+void Highbd2dVarTest::RunTest(bool is_random) {
   int failed = 0;
   for (int k = 0; k < kNumIterations; k++) {
     const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
@@ -775,7 +800,7 @@
     while (stride < width) {                // Make sure it's valid
       stride = 4 << rnd_(8);
     }
-    if (isRandom) {
+    if (is_random) {
       GenRandomData(width, height, stride);
     } else {
       GenExtremeData(width, height, stride);
@@ -828,11 +853,11 @@
 }
 
 TEST_P(Highbd2dVarTest, OperationCheck) {
-  RunTest(1);  // GenRandomData
+  RunTest(true);  // GenRandomData
 }
 
 TEST_P(Highbd2dVarTest, ExtremeValues) {
-  RunTest(0);  // GenExtremeData
+  RunTest(false);  // GenExtremeData
 }
 
 TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
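The SSE_Sum_Test changes above switch the parameter list from a plain height Range to an enumeration of TX_SIZE values, with the block geometry looked up from the tx_size_wide/tx_size_high tables pulled in via av1/common/common_data.h. The following is a minimal, self-contained sketch of that lookup pattern; TxSizeSketch, kTxWide and kTxHigh are local stand-ins for illustration, not the libaom definitions.

#include <cstdio>

// Local stand-ins for a handful of transform sizes and their dimension
// tables; the real TX_SIZE enum and the tx_size_wide/tx_size_high arrays
// live in the av1/common headers.
enum TxSizeSketch { kTx4x4, kTx8x8, kTx16x16, kTx4x8, kTx8x4, kTxSizeCount };
const int kTxWide[kTxSizeCount] = { 4, 8, 16, 4, 8 };
const int kTxHigh[kTxSizeCount] = { 4, 8, 16, 8, 4 };

int main() {
  // A parameterized test receives one transform size per instantiation and
  // derives width/height by table lookup, as SSE_Sum_Test::RunTest() now does.
  for (int tx = 0; tx < kTxSizeCount; ++tx) {
    std::printf("tx_size %d -> %dx%d block\n", tx, kTxWide[tx], kTxHigh[tx]);
  }
  return 0;
}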
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index d99d6a3..cc3fb67 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <climits>
 #include <vector>
 #include "config/aom_config.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -45,22 +46,22 @@
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     aom_codec_pts_t pts) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
     ++decoded_nframes_;
   }
 
   std::vector<FrameInfo> frame_info_list_;
 
-  virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
+  int GetNumSpatialLayers() override { return number_spatial_layers_; }
 
-  virtual void ResetModel() {
+  void ResetModel() override {
     DatarateTest::ResetModel();
     layer_frame_cnt_ = 0;
     superframe_cnt_ = 0;
@@ -94,10 +95,11 @@
     rps_recovery_frame_ = 0;
     user_define_frame_qp_ = 0;
     set_speed_per_layer_ = false;
+    simulcast_mode_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     int spatial_layer_id = 0;
     current_video_frame_ = video->frame();
     // video->frame() is called every superframe, so we should condition
@@ -144,7 +146,7 @@
         video->frame(), &layer_id_, &ref_frame_config_, &ref_frame_comp_pred_,
         spatial_layer_id, multi_ref_, comp_pred_,
         (video->frame() % cfg_.kf_max_dist) == 0, dynamic_enable_disable_mode_,
-        rps_mode_, rps_recovery_frame_);
+        rps_mode_, rps_recovery_frame_, simulcast_mode_);
     if (intra_only_ == 1 && frame_sync_ > 0) {
       // Set an Intra-only frame on SL0 at frame_sync_.
       // In order to allow decoding to start on SL0 in mid-sequence we need to
@@ -227,7 +229,7 @@
     }
   }
 
-  virtual void PostEncodeFrameHook(::libaom_test::Encoder *encoder) {
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
     int num_operating_points;
     encoder->Control(AV1E_GET_NUM_OPERATING_POINTS, &num_operating_points);
     ASSERT_EQ(num_operating_points,
@@ -242,7 +244,7 @@
     }
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
     // Update the layer cumulative  bitrate.
     for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
@@ -254,36 +256,50 @@
       last_pts_ = pkt->data.frame.pts;
       superframe_cnt_++;
     }
+    // For simulcast mode: verify that, for the first frame to start
+    // decoding, SL > 0 frames are Intra-only (not Key), whereas SL0 is Key.
+    if (simulcast_mode_ && superframe_cnt_ == (int)frame_to_start_decoding_) {
+      if (layer_id_.spatial_layer_id > 0) {
+        EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+      } else if (layer_id_.spatial_layer_id == 0) {
+        EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+      }
+    }
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     duration_ = ((last_pts_ + 1) * timebase_);
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
     }
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     if (drop_frames_ > 0) {
       for (unsigned int i = 0; i < drop_frames_; ++i) {
         if (drop_frames_list_[i] == (unsigned int)superframe_cnt_) {
           std::cout << "             Skipping decoding frame: "
                     << drop_frames_list_[i] << "\n";
-          return 0;
+          return false;
         }
       }
     } else if (intra_only_ == 1) {
       // Only start decoding at frames_to_start_decoding_.
-      if (current_video_frame_ < frame_to_start_decoding_) return 0;
+      if (current_video_frame_ < frame_to_start_decoding_) return false;
       // Only decode base layer for 3SL, for layer_to_decode_ = 0.
       if (layer_to_decode_ == 0 && frame_sync_ > 0 &&
           (layer_frame_cnt_ - 1) % 3 != 0)
-        return 0;
+        return false;
+    } else if (simulcast_mode_) {
+      // Only start decoding at frames_to_start_decoding_ and only
+      // for top spatial layer SL2 (layer_to_decode_).
+      if (current_video_frame_ < frame_to_start_decoding_) return false;
+      if (layer_id_.spatial_layer_id < (int)layer_to_decode_) return false;
     }
-    return 1;
+    return true;
   }
 
-  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
@@ -345,13 +361,301 @@
     }
   }
 
+  // Simulcast mode for 3 spatial and 3 temporal layers.
+  // No inter-layer prediction; the only prediction is temporal, with a
+  // single reference (LAST).
+  // No overlap in buffer slots between spatial layers. So for example,
+  // SL0 only uses slots 0 and 1.
+  // SL1 only uses slots 2 and 3.
+  // SL2 only uses slots 4 and 5.
+  // All 7 references for each inter-frame must only access buffer slots
+  // for that spatial layer.
+  // On key (super)frames: SL1 and SL2 must have no references set
+  // and must refresh all the slots for that layer only (so 2 and 3
+  // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+  // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+  // internally as Intra-only frames that allow that stream to be decoded.
+  // These conditions allow each spatial stream to be independently
+  // decodable (see the standalone sketch after this file's diff).
+  static void ref_config_simulcast3SL3TL(
+      aom_svc_ref_frame_config_t *ref_frame_config,
+      aom_svc_layer_id_t *layer_id, int is_key_frame, int superframe_cnt) {
+    int i;
+    // Initialize all references to 0 (don't use reference).
+    for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+      ref_frame_config->reference[i] = 0;
+    // Initialize as no refresh/update for all slots.
+    for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+    for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0;
+
+    if (is_key_frame) {
+      if (layer_id->spatial_layer_id == 0) {
+        // Assign LAST/GOLDEN to slot 0/1.
+        // Refresh slots 0 and 1 for SL0.
+        // SL0: this will get set to KEY frame internally.
+        ref_frame_config->ref_idx[0] = 0;
+        ref_frame_config->ref_idx[3] = 1;
+        ref_frame_config->refresh[0] = 1;
+        ref_frame_config->refresh[1] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Assign LAST/GOLDEN to slot 2/3.
+        // Refresh slots 2 and 3 for SL1.
+        // This will get set to Intra-only frame internally.
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->refresh[2] = 1;
+        ref_frame_config->refresh[3] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Assign LAST/GOLDEN to slot 4/5.
+        // Refresh slots 4 and 5 for SL2.
+        // This will get set to Intra-only frame internally.
+        ref_frame_config->ref_idx[0] = 4;
+        ref_frame_config->ref_idx[3] = 5;
+        ref_frame_config->refresh[4] = 1;
+        ref_frame_config->refresh[5] = 1;
+      }
+    } else if (superframe_cnt % 4 == 0) {
+      // Base temporal layer: TL0
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {  // SL0
+        // Reference LAST. Assign all references to either slot
+        // 0 or 1. Here we assign LAST to slot 0, all others to 1.
+        // Update slot 0 (LAST).
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 0;
+        ref_frame_config->refresh[0] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {  // SL1
+        // Reference LAST. Assign all references to either slot
+        // 2 or 3. Here we assign LAST to slot 2, all others to 3.
+        // Update slot 2 (LAST).
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 3;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->refresh[2] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {  // SL2
+        // Reference LAST. Assign all references to either slot
+        // 4 or 5. Here we assign LAST to slot 4, all others to 5.
+        // Update slot 4 (LAST).
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 5;
+        ref_frame_config->ref_idx[0] = 4;
+        ref_frame_config->refresh[4] = 1;
+      }
+    } else if ((superframe_cnt - 1) % 4 == 0) {
+      // First top temporal enhancement layer: TL2
+      layer_id->temporal_layer_id = 2;
+      if (layer_id->spatial_layer_id == 0) {  // SL0
+        // Reference LAST (slot 0). Assign other references to slot 1.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 0;
+      } else if (layer_id->spatial_layer_id == 1) {  // SL1
+        // Reference LAST (slot 2). Assign other references to slot 3.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 3;
+        ref_frame_config->ref_idx[0] = 2;
+      } else if (layer_id->spatial_layer_id == 2) {  // SL2
+        // Reference LAST (slot 4). Assign other references to slot 5.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 5;
+        ref_frame_config->ref_idx[0] = 4;
+      }
+    } else if ((superframe_cnt - 2) % 4 == 0) {
+      // Middle temporal enhancement layer: TL1
+      layer_id->temporal_layer_id = 1;
+      if (layer_id->spatial_layer_id == 0) {  // SL0
+        // Reference LAST (slot 0).
+        // Set GOLDEN to slot 1 and update slot 1.
+        // This will be used as reference for next TL2.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 0;
+        ref_frame_config->refresh[1] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {  // SL1
+        // Reference LAST (slot 2).
+        // Set GOLDEN to slot 3 and update slot 3.
+        // This will be used as reference for next TL2.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 3;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->refresh[3] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {  // SL2
+        // Reference LAST (slot 4).
+        // Set GOLDEN to slot 5 and update slot 5.
+        // This will be used as reference for next TL2.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 5;
+        ref_frame_config->ref_idx[0] = 4;
+        ref_frame_config->refresh[5] = 1;
+      }
+    } else if ((superframe_cnt - 3) % 4 == 0) {
+      // Second top temporal enhancement layer: TL2
+      layer_id->temporal_layer_id = 2;
+      if (layer_id->spatial_layer_id == 0) {  // SL0
+        // Reference LAST (slot 1). Assign other references to slot 0.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {  // SL1
+        // Reference LAST (slot 3). Assign other references to slot 2.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 2;
+        ref_frame_config->ref_idx[0] = 3;
+      } else if (layer_id->spatial_layer_id == 2) {  // SL2
+        // Reference LAST (slot 5). Assign other references to slot 4.
+        // No update/refresh on any slots.
+        ref_frame_config->reference[0] = 1;
+        for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config->ref_idx[i] = 4;
+        ref_frame_config->ref_idx[0] = 5;
+      }
+    }
+  }
+
+  // 3 spatial and 3 temporal layers.
+  // Overlap in the buffer slot updates: the slots 3 and 4 updated by
+  // first TL2 are reused for update in TL1 superframe.
+  static void ref_config_3SL3TL(aom_svc_ref_frame_config_t *ref_frame_config,
+                                aom_svc_layer_id_t *layer_id, int is_key_frame,
+                                int superframe_cnt) {
+    if (superframe_cnt % 4 == 0) {
+      // Base temporal layer.
+      layer_id->temporal_layer_id = 0;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST, update LAST.
+        // Set all buffer_idx to 0.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->refresh[0] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+        // GOLDEN (and all other refs) to slot 0.
+        // Update slot 1 (LAST).
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->refresh[1] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+        // GOLDEN (and all other refs) to slot 1.
+        // Update slot 2 (LAST).
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->refresh[2] = 1;
+      }
+    } else if ((superframe_cnt - 1) % 4 == 0) {
+      // First top temporal enhancement layer.
+      layer_id->temporal_layer_id = 2;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST (slot 0).
+        // Set GOLDEN to slot 3 and update slot 3.
+        // Set all other buffer_idx to slot 0.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->refresh[3] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+        // GOLDEN (and all other refs) to slot 3.
+        // Set LAST2 to slot 4 and Update slot 4.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[1] = 4;
+        ref_frame_config->refresh[4] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+        // GOLDEN (and all other refs) to slot 4.
+        // No update.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+        ref_frame_config->ref_idx[0] = 2;
+      }
+    } else if ((superframe_cnt - 2) % 4 == 0) {
+      // Middle temporal enhancement layer.
+      layer_id->temporal_layer_id = 1;
+      if (layer_id->spatial_layer_id == 0) {
+        // Reference LAST.
+        // Set all buffer_idx to 0.
+        // Set GOLDEN to slot 3 and update slot 3.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->refresh[3] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+        // GOLDEN (and all other refs) to slot 3.
+        // Set LAST2 to slot 4 and update slot 4.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[2] = 4;
+        ref_frame_config->refresh[4] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+        // GOLDEN (and all other refs) to slot 4.
+        // Set LAST2 to slot 5 and update slot 5.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->ref_idx[2] = 5;
+        ref_frame_config->refresh[5] = 1;
+      }
+    } else if ((superframe_cnt - 3) % 4 == 0) {
+      // Second top temporal enhancement layer.
+      layer_id->temporal_layer_id = 2;
+      if (layer_id->spatial_layer_id == 0) {
+        // Set LAST to slot 3 and reference LAST.
+        // Set GOLDEN to slot 3 and update slot 3.
+        // Set all other buffer_idx to 0.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 3;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->refresh[3] = 1;
+      } else if (layer_id->spatial_layer_id == 1) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
+        // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 4;
+        ref_frame_config->ref_idx[3] = 3;
+        ref_frame_config->ref_idx[1] = 4;
+        ref_frame_config->refresh[4] = 1;
+      } else if (layer_id->spatial_layer_id == 2) {
+        // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
+        // GOLDEN to slot 4. No update.
+        for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+        ref_frame_config->ref_idx[0] = 5;
+        ref_frame_config->ref_idx[3] = 4;
+      }
+    }
+    if (layer_id->spatial_layer_id > 0) {
+      // Always reference GOLDEN (inter-layer prediction).
+      ref_frame_config->reference[3] = 1;
+      if (is_key_frame && layer_id->spatial_layer_id > 0) {
+        // On superframes whose base is key: remove LAST since GOLDEN
+        // is used as reference.
+        ref_frame_config->reference[0] = 0;
+      }
+    }
+  }
+
   // Layer pattern configuration.
   virtual int set_layer_pattern(
       int frame_cnt, aom_svc_layer_id_t *layer_id,
       aom_svc_ref_frame_config_t *ref_frame_config,
       aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,
       int multi_ref, int comp_pred, int is_key_frame,
-      int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame) {
+      int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame,
+      int simulcast_mode) {
     int lag_index = 0;
     int base_count = frame_cnt >> 2;
     layer_id->spatial_layer_id = spatial_layer;
@@ -506,128 +810,21 @@
       // Reference GOLDEN.
       if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
     } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
-      // 3 spatial and 3 temporal layer.
-      // Overlap in the buffer slot updates: the slots 3 and 4 updated by
-      // first TL2 are reused for update in TL1 superframe.
-      if (superframe_cnt_ % 4 == 0) {
-        // Base temporal layer.
-        layer_id->temporal_layer_id = 0;
-        if (layer_id->spatial_layer_id == 0) {
-          // Reference LAST, update LAST.
-          // Set all buffer_idx to 0.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->refresh[0] = 1;
-        } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
-          // GOLDEN (and all other refs) to slot 0.
-          // Update slot 1 (LAST).
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->refresh[1] = 1;
-        } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
-          // GOLDEN (and all other refs) to slot 1.
-          // Update slot 2 (LAST).
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
-          ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->refresh[2] = 1;
+      if (simulcast_mode) {
+        ref_config_simulcast3SL3TL(ref_frame_config, layer_id, is_key_frame,
+                                   superframe_cnt_);
+      } else {
+        ref_config_3SL3TL(ref_frame_config, layer_id, is_key_frame,
+                          superframe_cnt_);
+        // Allow for top spatial layer to use additional temporal reference.
+        // Additional reference is only updated on base temporal layer, every
+        // 10 TL0 frames here.
+        if (multi_ref && layer_id->spatial_layer_id == 2) {
+          ref_frame_config->ref_idx[6] = 7;
+          if (!is_key_frame) ref_frame_config->reference[6] = 1;
+          if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+            ref_frame_config->refresh[7] = 1;
         }
-      } else if ((superframe_cnt_ - 1) % 4 == 0) {
-        // First top temporal enhancement layer.
-        layer_id->temporal_layer_id = 2;
-        if (layer_id->spatial_layer_id == 0) {
-          // Reference LAST (slot 0).
-          // Set GOLDEN to slot 3 and update slot 3.
-          // Set all other buffer_idx to slot 0.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->refresh[3] = 1;
-        } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
-          // GOLDEN (and all other refs) to slot 3.
-          // Set LAST2 to slot 4 and Update slot 4.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[1] = 4;
-          ref_frame_config->refresh[4] = 1;
-        } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
-          // GOLDEN (and all other refs) to slot 4.
-          // No update.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
-          ref_frame_config->ref_idx[0] = 2;
-        }
-      } else if ((superframe_cnt_ - 2) % 4 == 0) {
-        // Middle temporal enhancement layer.
-        layer_id->temporal_layer_id = 1;
-        if (layer_id->spatial_layer_id == 0) {
-          // Reference LAST.
-          // Set all buffer_idx to 0.
-          // Set GOLDEN to slot 3 and update slot 3.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->refresh[3] = 1;
-        } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
-          // GOLDEN (and all other refs) to slot 3.
-          // Set LAST2 to slot 4 and update slot 4.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[2] = 4;
-          ref_frame_config->refresh[4] = 1;
-        } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
-          // GOLDEN (and all other refs) to slot 4.
-          // Set LAST2 to slot 5 and update slot 5.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
-          ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->ref_idx[2] = 5;
-          ref_frame_config->refresh[5] = 1;
-        }
-      } else if ((superframe_cnt_ - 3) % 4 == 0) {
-        // Second top temporal enhancement layer.
-        layer_id->temporal_layer_id = 2;
-        if (layer_id->spatial_layer_id == 0) {
-          // Set LAST to slot 3 and reference LAST.
-          // Set GOLDEN to slot 3 and update slot 3.
-          // Set all other buffer_idx to 0.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 3;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->refresh[3] = 1;
-        } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
-          // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 4;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->ref_idx[1] = 4;
-          ref_frame_config->refresh[4] = 1;
-        } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
-          // GOLDEN to slot 4. No update.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 5;
-          ref_frame_config->ref_idx[3] = 4;
-        }
-      }
-      if (layer_id->spatial_layer_id > 0) {
-        // Always reference GOLDEN (inter-layer prediction).
-        ref_frame_config->reference[3] = 1;
-        if (is_key_frame && layer_id->spatial_layer_id > 0) {
-          // On superframes whose base is key: remove LAST since GOLDEN
-          // is used as reference.
-          ref_frame_config->reference[0] = 0;
-        }
-      }
-      // Allow for top spatial layer to use additional temporal reference.
-      // Additional reference is only updated on base temporal layer, every
-      // 10 TL0 frames here.
-      if (multi_ref && layer_id->spatial_layer_id == 2) {
-        ref_frame_config->ref_idx[6] = 7;
-        if (!is_key_frame) ref_frame_config->reference[6] = 1;
-        if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
-          ref_frame_config->refresh[7] = 1;
       }
     }
     // If the top spatial layer is first-time encoded in mid-sequence
@@ -1090,6 +1287,60 @@
     EXPECT_EQ((int)GetMismatchFrames(), 150);
   }
 
+  virtual void BasicRateTargetingSVC3TL3SLSimulcast() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.kf_max_dist = 150;
+    cfg_.kf_min_dist = 150;
+    int num_frames = 300;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, num_frames);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    simulcast_mode_ = 1;
+    frame_to_start_decoding_ = cfg_.kf_max_dist;
+    layer_to_decode_ = 2;  // SL2
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Only SL2 layer is decoded.
+    for (int tl = 0; tl < number_temporal_layers_; tl++) {
+      int i = layer_to_decode_ * number_temporal_layers_ + tl;
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.6)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.7)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Only top spatial layer (SL2) is decoded, starting at frame 150
+    // (frame_to_start_decoding_), so there are (300 - 150) / 2 = 75
+    // non-reference frames and the mismatch count is 75.
+    int num_mismatch = (num_frames - frame_to_start_decoding_) / 2;
+    EXPECT_EQ((int)GetMismatchFrames(), num_mismatch);
+  }
+
   virtual void BasicRateTargetingSVC1TL2SLIntraOnlyTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -2133,6 +2384,7 @@
   int screen_mode_;
   int rps_mode_;
   int rps_recovery_frame_;
+  int simulcast_mode_;
 
   int user_define_frame_qp_;
   int frame_qp_;
@@ -2199,6 +2451,14 @@
   BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll();
 }
 
+// Check simulcast mode for 3 spatial and 3 temporal layers. A Key frame is
+// inserted on base SL0 in mid-stream; verify that the top spatial layer
+// (SL2) can be decoded, starting with an Intra-only frame.
+// Verify that we can decode all frames for SL2 with no mismatch.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLSimulcast) {
+  BasicRateTargetingSVC3TL3SLSimulcast();
+}
+
 // Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal,
 // with Intra-only frame inserted in the stream.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SLIntraOnly) {
@@ -2373,6 +2633,39 @@
   BasicRateTargetingRPS1TL1SLDropFramesTest();
 }
 
+TEST(SvcParams, BitrateOverflow) {
+  uint8_t buf[6] = { 0 };
+  aom_image_t img;
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf));
+
+  aom_codec_iface_t *const iface = aom_codec_av1_cx();
+  EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME),
+            AOM_CODEC_OK);
+  cfg.g_w = 1;
+  cfg.g_h = 1;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+  aom_svc_params_t svc_params = {};
+  svc_params.framerate_factor[0] = 1;
+  svc_params.framerate_factor[1] = 2;
+  svc_params.number_spatial_layers = 1;
+  svc_params.number_temporal_layers = 2;
+  svc_params.layer_target_bitrate[0] = INT_MAX;
+  svc_params.layer_target_bitrate[1] = INT_MAX;
+  EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params),
+            AOM_CODEC_OK);
+  EXPECT_EQ(
+      aom_codec_encode(&enc, &img, /*pts=*/0, /*duration=*/1, /*flags=*/0),
+      AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_encode(&enc, /*img=*/nullptr, /*pts=*/0, /*duration=*/0,
+                             /*flags=*/0),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
 AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
                            ::testing::Values(::libaom_test::kRealTime),
                            ::testing::Range(7, 12), ::testing::Values(0, 3),
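The ref_config_simulcast3SL3TL() logic above rests on a fixed two-slots-per-spatial-layer partition of the reference buffer (slots 0/1 for SL0, 2/3 for SL1, 4/5 for SL2). Below is a minimal, self-contained sketch of that mapping for the base temporal layer (TL0) case; SlotConfig, SimulcastTL0Config and the kInterRefsPerFrame/kRefFrames constants are illustrative stand-ins, not the libaom aom_svc_ref_frame_config_t API.

#include <array>
#include <cassert>
#include <cstdio>

namespace {

constexpr int kInterRefsPerFrame = 7;  // mirrors INTER_REFS_PER_FRAME
constexpr int kRefFrames = 8;          // mirrors REF_FRAMES

// Stand-in for the reference/refresh bookkeeping carried per frame.
struct SlotConfig {
  std::array<int, kInterRefsPerFrame> ref_idx{};
  std::array<int, kRefFrames> refresh{};
};

// For spatial layer `sl` (0..2), map LAST to slot 2*sl and every other
// reference to slot 2*sl + 1, so no buffer slot is shared across layers.
SlotConfig SimulcastTL0Config(int sl) {
  assert(sl >= 0 && sl <= 2);
  SlotConfig cfg;
  const int last_slot = 2 * sl;
  const int alt_slot = 2 * sl + 1;
  cfg.ref_idx.fill(alt_slot);
  cfg.ref_idx[0] = last_slot;  // LAST
  cfg.refresh[last_slot] = 1;  // TL0 refreshes only its own LAST slot.
  return cfg;
}

}  // namespace

int main() {
  for (int sl = 0; sl < 3; ++sl) {
    const SlotConfig cfg = SimulcastTL0Config(sl);
    std::printf("SL%d: LAST -> slot %d, other refs -> slot %d\n", sl,
                cfg.ref_idx[0], cfg.ref_idx[1]);
  }
  return 0;
}

Keeping the slot sets disjoint is what lets each simulcast stream be dropped or decoded independently, which the simulcast datarate test exercises by decoding only SL2 from the mid-stream key frame onward.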
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index e689cd3..85f68b8 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -57,8 +57,8 @@
 class TemporalFilterTest
     : public ::testing::TestWithParam<TemporalFilterWithParam> {
  public:
-  virtual ~TemporalFilterTest() {}
-  virtual void SetUp() {
+  ~TemporalFilterTest() override = default;
+  void SetUp() override {
     params_ = GET_PARAM(0);
     tf_wgt_calc_lvl_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -71,7 +71,7 @@
     ASSERT_NE(src2_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src1_);
     aom_free(src2_);
   }
@@ -308,6 +308,23 @@
                                  Values(0, 1)));
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+  TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+                          &av1_apply_temporal_filter_neon_dotprod)
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+                                 Values(0, 1)));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_AVX2 || HAVE_NEON
+// Width and height for which av1_estimate_noise_from_single_plane() will be
+// tested.
+const int kWidths[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
+const int kHeights[] = { 2160, 1080, 720, 600, 480, 240, 237 };
+#endif  // HAVE_AVX2 || HAVE_NEON
+
 typedef double (*EstimateNoiseFunc)(const uint8_t *src, int height, int width,
                                     int stride, int edge_thresh);
 
@@ -317,8 +334,8 @@
 class EstimateNoiseTest
     : public ::testing::TestWithParam<EstimateNoiseWithParam> {
  public:
-  virtual ~EstimateNoiseTest() {}
-  virtual void SetUp() {
+  ~EstimateNoiseTest() override = default;
+  void SetUp() override {
     ref_func = GET_PARAM(0);
     tst_func = GET_PARAM(1);
     width_ = GET_PARAM(2);
@@ -330,7 +347,7 @@
     ASSERT_NE(src1_, nullptr);
   }
 
-  virtual void TearDown() { aom_free(src1_); }
+  void TearDown() override { aom_free(src1_); }
 
   void RunTest(int run_times) {
     stride_ = width_;
@@ -387,11 +404,6 @@
 TEST_P(EstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
 
 #if HAVE_AVX2
-// Width and height for which av1_estimate_noise_from_single_plane() will be
-// tested.
-const int kWidths[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
-const int kHeights[] = { 2160, 1080, 720, 600, 480, 240, 237 };
-
 INSTANTIATE_TEST_SUITE_P(
     AVX2, EstimateNoiseTest,
     ::testing::Combine(
@@ -400,6 +412,15 @@
         ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, EstimateNoiseTest,
+    ::testing::Combine(
+        ::testing::Values(av1_estimate_noise_from_single_plane_c),
+        ::testing::Values(av1_estimate_noise_from_single_plane_neon),
+        ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
+#endif  // HAVE_NEON
+
 #if CONFIG_AV1_HIGHBITDEPTH
 
 typedef void (*HBDTemporalFilterFunc)(
@@ -416,8 +437,8 @@
 class HBDTemporalFilterTest
     : public ::testing::TestWithParam<HBDTemporalFilterWithParam> {
  public:
-  virtual ~HBDTemporalFilterTest() {}
-  virtual void SetUp() {
+  ~HBDTemporalFilterTest() override = default;
+  void SetUp() override {
     params_ = GET_PARAM(0);
     tf_wgt_calc_lvl_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -430,7 +451,7 @@
     ASSERT_NE(src2_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src1_);
     aom_free(src2_);
   }
@@ -664,6 +685,104 @@
                          Combine(ValuesIn(HBDtemporal_filter_test_avx2),
                                  Values(0, 1)));
 #endif  // HAVE_AVX2
+
+#if HAVE_NEON
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_neon),
+                                 Values(0, 1)));
+#endif  // HAVE_NEON
+
+using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
+                                        int width, int stride, int bit_depth,
+                                        int edge_thresh);
+
+using HBDEstimateNoiseWithParam =
+    std::tuple<HBDEstimateNoiseFunc, HBDEstimateNoiseFunc, int, int, int>;
+
+class HBDEstimateNoiseTest
+    : public ::testing::TestWithParam<HBDEstimateNoiseWithParam> {
+ public:
+  HBDEstimateNoiseTest()
+      : ref_func_(GET_PARAM(0)), tst_func_(GET_PARAM(1)),
+        rnd_(libaom_test::ACMRandom::DeterministicSeed()), width_(GET_PARAM(2)),
+        height_(GET_PARAM(3)), bitdepth_(GET_PARAM(4)) {}
+  ~HBDEstimateNoiseTest() override = default;
+  void SetUp() override {
+    src1_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * width_ * height_));
+    ASSERT_NE(src1_, nullptr);
+    GenRandomData(width_ * height_);
+  }
+
+  void TearDown() override { aom_free(src1_); }
+
+  void RunTest() {
+    stride_ = width_;
+
+    double ref_out = ref_func_(src1_, height_, width_, stride_, bitdepth_,
+                               NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+    double tst_out = tst_func_(src1_, height_, width_, stride_, bitdepth_,
+                               NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+    EXPECT_EQ(ref_out, tst_out);
+  }
+
+  void SpeedTest(int run_times) {
+    stride_ = width_;
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; i++) {
+      ref_func_(src1_, height_, width_, stride_, bitdepth_,
+                NOISE_ESTIMATION_EDGE_THRESHOLD);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; i++) {
+      tst_func_(src1_, height_, width_, stride_, bitdepth_,
+                NOISE_ESTIMATION_EDGE_THRESHOLD);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %dx%d :%7.2f/%7.2f (%3.2f)\n", bitdepth_, width_, height_, time1,
+           time2, time1 / time2);
+  }
+
+  void GenRandomData(int size) {
+    for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand12();
+  }
+
+ private:
+  HBDEstimateNoiseFunc ref_func_;
+  HBDEstimateNoiseFunc tst_func_;
+  ACMRandom rnd_;
+  uint16_t *src1_;
+  int width_;
+  int height_;
+  int stride_;
+  int bitdepth_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDEstimateNoiseTest);
+
+TEST_P(HBDEstimateNoiseTest, RandomValues) { RunTest(); }
+
+TEST_P(HBDEstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HBDEstimateNoiseTest,
+    ::testing::Combine(
+        ::testing::Values(av1_highbd_estimate_noise_from_single_plane_c),
+        ::testing::Values(av1_highbd_estimate_noise_from_single_plane_neon),
+        ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights),
+        ::testing::ValuesIn({ 8, 10, 12 })));
+#endif  // HAVE_NEON
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
 #endif
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 4bd0ddc..4b4a96d 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -571,3 +571,5 @@
 36a4fcf07e645ed522cde5845dd9c6ab2b2d1502 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5
 9f935d391fdf4a6f7c320355d45770d2e7d6095c *desktopqvga2.320_240.yuv
 4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
+ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
+9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
diff --git a/test/test.cmake b/test/test.cmake
index 672edb3..ce94a5a 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -157,21 +157,29 @@
               "${AOM_ROOT}/test/simd_cmp_impl.h"
               "${AOM_ROOT}/test/simd_impl.h")
 
-  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
-              "${AOM_ROOT}/test/simd_cmp_neon.cc")
-  add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_NEON)
+  if(HAVE_SSE2)
+    list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
+                "${AOM_ROOT}/test/simd_cmp_sse2.cc")
+    add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE2)
+  endif()
 
-  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
-              "${AOM_ROOT}/test/simd_cmp_sse2.cc")
-  add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE2)
+  if(HAVE_SSSE3)
+    list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
+                "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
+    add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3)
+  endif()
 
-  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
-              "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
-  add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3)
+  if(HAVE_SSE4_1)
+    list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1
+                "${AOM_ROOT}/test/simd_cmp_sse4.cc")
+    add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1)
+  endif()
 
-  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
-              "${AOM_ROOT}/test/simd_cmp_avx2.cc")
-  add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_AVX2)
+  if(HAVE_AVX2)
+    list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
+                "${AOM_ROOT}/test/simd_cmp_avx2.cc")
+    add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_AVX2)
+  endif()
 
   list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
               "${AOM_ROOT}/test/arf_freq_test.cc"
@@ -180,6 +188,7 @@
               "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+              "${AOM_ROOT}/test/av1_k_means_test.cc"
               "${AOM_ROOT}/test/av1_nn_predict_test.cc"
               "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
               "${AOM_ROOT}/test/av1_softmax_test.cc"
@@ -192,15 +201,16 @@
               "${AOM_ROOT}/test/comp_avg_pred_test.cc"
               "${AOM_ROOT}/test/comp_avg_pred_test.h"
               "${AOM_ROOT}/test/comp_mask_pred_test.cc"
+              "${AOM_ROOT}/test/disflow_test.cc"
               "${AOM_ROOT}/test/encodemb_test.cc"
               "${AOM_ROOT}/test/encodetxb_test.cc"
               "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
               "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
               "${AOM_ROOT}/test/error_block_test.cc"
+              "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
               "${AOM_ROOT}/test/firstpass_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
-              "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/hadamard_test.cc"
               "${AOM_ROOT}/test/horver_correlation_test.cc"
               "${AOM_ROOT}/test/masked_sad_test.cc"
@@ -212,23 +222,17 @@
               "${AOM_ROOT}/test/obmc_sad_test.cc"
               "${AOM_ROOT}/test/obmc_variance_test.cc"
               "${AOM_ROOT}/test/pickrst_test.cc"
+              "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sad_test.cc"
               "${AOM_ROOT}/test/subtract_test.cc"
-              "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sum_squares_test.cc"
               "${AOM_ROOT}/test/sse_sum_test.cc"
               "${AOM_ROOT}/test/variance_test.cc"
-              "${AOM_ROOT}/test/wiener_test.cc"
-              "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/warp_filter_test.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.h"
               "${AOM_ROOT}/test/webmenc_test.cc"
-              "${AOM_ROOT}/test/av1_k_means_test.cc")
-
-  list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
-              "${AOM_ROOT}/test/simd_cmp_sse4.cc")
-  add_to_libaom_test_srcs(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+              "${AOM_ROOT}/test/wiener_test.cc")
 
   if(NOT CONFIG_REALTIME_ONLY)
     list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
@@ -293,11 +297,6 @@
     endif()
   endif()
 
-  if(HAVE_NEON)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_neon_test.cc")
-  endif()
-
   if(CONFIG_FPMT_TEST AND (NOT CONFIG_REALTIME_ONLY))
     list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
                 "${AOM_ROOT}/test/frame_parallel_enc_test.cc")
@@ -313,7 +312,7 @@
                 "${AOM_ROOT}/test/simd_ssse3_test.cc")
   endif()
 
-  if(HAVE_SSE4)
+  if(HAVE_SSE4_1)
     list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
                 "${AOM_ROOT}/test/simd_sse4_test.cc")
   endif()
@@ -351,13 +350,13 @@
                 "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
                 "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
                 "${AOM_ROOT}/test/intra_edge_test.cc")
-
   endif()
 
   if(HAVE_NEON)
     list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
                 "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
-                "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
+                "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
+                "${AOM_ROOT}/test/intra_edge_test.cc")
   endif()
 
   if(HAVE_SSE4_2 OR HAVE_ARM_CRC32)
@@ -366,10 +365,10 @@
 
   if(CONFIG_REALTIME_ONLY)
     list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/disflow_test.cc"
                      "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
                      "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
                      "${AOM_ROOT}/test/firstpass_test.cc"
-                     "${AOM_ROOT}/test/frame_error_test.cc"
                      "${AOM_ROOT}/test/motion_vector_test.cc"
                      "${AOM_ROOT}/test/obmc_sad_test.cc"
                      "${AOM_ROOT}/test/obmc_variance_test.cc"
@@ -498,9 +497,6 @@
 
   target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
 
-  if(CONFIG_LIBYUV)
-    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
-  endif()
   if(CONFIG_WEBM_IO)
     target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
   endif()
@@ -530,10 +526,6 @@
     add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
   endif()
-  if(HAVE_ARM_CRC32)
-    add_intrinsics_source_to_target("${AOM_ARM_CRC32_FLAG}" "test_libaom"
-                                    "AOM_UNIT_TEST_COMMON_INTRIN_CRC32")
-  endif()
 
   if(ENABLE_TESTDATA)
     make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake
index de7d153..069e1ad 100644
--- a/test/test_data_util.cmake
+++ b/test/test_data_util.cmake
@@ -39,7 +39,9 @@
             "vase10x10.yuv"
             "vase10x10_tiles.txt"
             "bus_352x288_420_f20_b8.yuv"
-            "test_input_w1h1.yuv")
+            "test_input_w1h1.yuv"
+            "crowd_run_360p_10_150f.y4m"
+            "wikipedia_420_360p_60f.y4m")
 
 if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
   list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
diff --git a/test/test_libaom.cc b/test/test_libaom.cc
index 6ffbbc5..fbd7f2e 100644
--- a/test/test_libaom.cc
+++ b/test/test_libaom.cc
@@ -9,24 +9,29 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <string.h>
-
-#include <string>
-
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
 
+#if !CONFIG_SHARED
+#include <string.h>
+
+#include <string>
+
+#if AOM_ARCH_ARM
+#include "aom_ports/arm.h"
+#endif
 #if AOM_ARCH_X86 || AOM_ARCH_X86_64
 #include "aom_ports/x86.h"
 #endif
+
 extern "C" {
 extern void av1_rtcd();
 extern void aom_dsp_rtcd();
 extern void aom_scale_rtcd();
 }
 
-#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+#if AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
 static void append_negative_gtest_filter(const char *str) {
   std::string flag_value = GTEST_FLAG_GET(filter);
   // Negative patterns begin with one '-' followed by a ':' separated list.
@@ -44,11 +49,24 @@
   }
   GTEST_FLAG_SET(filter, flag_value);
 }
-#endif  // AOM_ARCH_X86 || AOM_ARCH_X86_64
+#endif  // AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
+#endif  // !CONFIG_SHARED
 
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
+#if !CONFIG_SHARED
+#if AOM_ARCH_AARCH64
+  const int caps = aom_arm_cpu_caps();
+  if (!(caps & HAS_ARM_CRC32)) append_negative_gtest_filter("ARM_CRC32");
+  if (!(caps & HAS_NEON_DOTPROD)) append_negative_gtest_filter("NEON_DOTPROD");
+  if (!(caps & HAS_NEON_I8MM)) append_negative_gtest_filter("NEON_I8MM");
+  if (!(caps & HAS_SVE)) append_negative_gtest_filter("SVE");
+#elif AOM_ARCH_ARM
+  const int caps = aom_arm_cpu_caps();
+  if (!(caps & HAS_NEON)) append_negative_gtest_filter("NEON");
+#endif  // AOM_ARCH_ARM
+
 #if AOM_ARCH_X86 || AOM_ARCH_X86_64
   const int simd_caps = x86_simd_caps();
   if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
@@ -62,9 +80,8 @@
   if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
 #endif  // AOM_ARCH_X86 || AOM_ARCH_X86_64
 
-// Shared library builds don't support whitebox tests that exercise internal
-// symbols.
-#if !CONFIG_SHARED
+  // Shared library builds don't support whitebox tests that exercise internal
+  // symbols.
   av1_rtcd();
   aom_dsp_rtcd();
   aom_scale_rtcd();
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 87155c7..39414e3 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -41,7 +41,7 @@
  protected:
   TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) {}
 
-  virtual ~TestVectorTest() {
+  ~TestVectorTest() override {
     if (md5_file_) fclose(md5_file_);
   }
 
@@ -51,14 +51,13 @@
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void PreDecodeFrameHook(
-      const libaom_test::CompressedVideoSource &video,
-      libaom_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
     if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
   }
 
-  virtual void DecompressedFrameHook(const aom_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc
index 517d54b..e2ac592 100644
--- a/test/tile_config_test.cc
+++ b/test/tile_config_test.cc
@@ -82,9 +82,9 @@
     max_tile_cols_log2_ = tile_log2(1, AOM_MAX_TILE_COLS);
     max_tile_rows_log2_ = tile_log2(1, AOM_MAX_TILE_ROWS);
   }
-  virtual ~UniformTileConfigTestLarge() {}
+  ~UniformTileConfigTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -93,10 +93,10 @@
     cfg_.g_lag_in_frames = 19;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_TILE_COLUMNS, tile_config_param_.tile_cols);
       encoder->Control(AV1E_SET_TILE_ROWS, tile_config_param_.tile_rows);
@@ -109,8 +109,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -148,9 +148,9 @@
         tile_config_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
     tile_config_violated_ = false;
   }
-  virtual ~NonUniformTileConfigTestLarge() {}
+  ~NonUniformTileConfigTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -168,10 +168,10 @@
                tile_config_param_.tile_height_count);
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -182,8 +182,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
@@ -302,9 +302,9 @@
         tile_group_config_params_(GET_PARAM(2)) {
     tile_group_config_violated_ = false;
   }
-  virtual ~TileGroupTestLarge() {}
+  ~TileGroupTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
@@ -312,10 +312,10 @@
     cfg_.g_threads = 1;
   }
 
-  virtual bool DoDecode() const { return 1; }
+  bool DoDecode() const override { return true; }
 
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AV1E_SET_NUM_TG, tile_group_config_params_.num_tg);
@@ -326,8 +326,8 @@
     }
   }
 
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     if (AOM_CODEC_OK == res_dec) {
       aom_tile_info tile_info;
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 888c3ab..84406dd 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -47,15 +47,15 @@
     }
   }
 
-  virtual ~TileIndependenceTest() {
+  ~TileIndependenceTest() override {
     delete fw_dec_;
     delete inv_dec_;
   }
 
-  virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
+  void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
       encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
@@ -82,7 +82,7 @@
     md5->Add(img);
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
     UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
     UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
   }
@@ -123,7 +123,7 @@
 }
 
 class TileIndependenceTestLarge : public TileIndependenceTest {
-  virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+  void SetCpuUsed(libaom_test::Encoder *encoder) override {
     static const int kCpuUsed = 0;
     encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
   }
diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc
index baa0dc0..5de98b7 100644
--- a/test/time_stamp_test.cc
+++ b/test/time_stamp_test.cc
@@ -47,16 +47,16 @@
            (static_cast<double>(framerate_numerator_) / framerate_denominator_);
   }
 
-  virtual aom_codec_pts_t pts() const {
+  aom_codec_pts_t pts() const override {
     return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() +
                                         starting_pts_ + 0.5);
   }
 
-  virtual unsigned long duration() const {
+  unsigned long duration() const override {
     return static_cast<unsigned long>(FrameDuration() + 0.5);
   }
 
-  virtual aom_rational_t timebase() const { return timebase_; }
+  aom_rational_t timebase() const override { return timebase_; }
 
   void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
 
@@ -72,9 +72,9 @@
       public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
  protected:
   TimestampTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~TimestampTest() {}
+  ~TimestampTest() override = default;
 
-  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
 };
 
 // Tests encoding in millisecond timebase.
diff --git a/test/tools_common.sh b/test/tools_common.sh
index f2d1802..cb9eba1 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -312,7 +312,11 @@
   # Combine environment and actual tests.
   local tests_to_run="${env_tests} ${tests_to_filter}"
 
-  check_version_strings
+  # av1_c_vs_simd_encode is a standalone test, and it doesn't need to check the
+  # version string.
+  if [ "${test_name}" != "av1_c_vs_simd_encode" ]; then
+    check_version_strings
+  fi
 
   # Run tests.
   for test in ${tests_to_run}; do
@@ -464,6 +468,8 @@
 
 AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no}
 
+# This check requires config/aom_config.c, which is available in Jenkins
+# testing.
 if [ "$(is_windows_target)" = "yes" ]; then
   AOM_TEST_EXE_SUFFIX=".exe"
 fi
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 260f4ff..55e78fe 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -12,11 +12,12 @@
 #ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_
 #define AOM_TEST_TRANSFORM_TEST_BASE_H_
 
-#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "aom_mem/aom_mem.h"
 #include "aom/aom_codec.h"
 #include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "test/acm_random.h"
 
 namespace libaom_test {
 
@@ -40,7 +41,7 @@
 template <typename OutType>
 class TransformTestBase {
  public:
-  virtual ~TransformTestBase() {}
+  virtual ~TransformTestBase() = default;
 
  protected:
   virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0;
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2863aea..adca1b1 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -348,7 +348,7 @@
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() {}
+  ~SumOfSquaresTest() override = default;
 
  protected:
   void ConstTest();
@@ -427,7 +427,7 @@
 class MseWxHTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -439,7 +439,7 @@
     ASSERT_NE(dst_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dst_);
     src_ = nullptr;
@@ -528,7 +528,7 @@
   // Memory required to compute mse of two 8x8 and four 4x4 blocks assigned for
   // maximum width 16 and maximum height 8.
   int mem_size = 16 * 8;
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint16_t *>(
@@ -539,7 +539,7 @@
     ASSERT_NE(dst_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dst_);
     src_ = nullptr;
@@ -659,7 +659,7 @@
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -678,7 +678,7 @@
     }
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
       src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
@@ -1286,7 +1286,7 @@
 class SubpelVarianceTest
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1308,7 +1308,7 @@
     ASSERT_NE(ref_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (!use_high_bit_depth()) {
       aom_free(src_);
       aom_free(ref_);
@@ -1544,7 +1544,7 @@
 class ObmcVarianceTest
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1553,7 +1553,7 @@
           aom_memalign(32, block_size() + width() + height() + 1));
     } else {
       pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
-          32, block_size() + width() + height() + 1 * sizeof(uint16_t))));
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
     }
     wsrc_ = reinterpret_cast<int32_t *>(
         aom_memalign(32, block_size() * sizeof(uint32_t)));
@@ -1564,7 +1564,7 @@
     ASSERT_NE(mask_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (!use_high_bit_depth()) {
       aom_free(pre_);
     } else {
@@ -1635,7 +1635,8 @@
         memset(pre_ + half, 0, half + width() + height() + 1);
       } else {
         aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
-        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0,
+                     half + width() + height() + 1);
       }
       for (int j = 0; j < half; j++) {
         wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
@@ -1962,7 +1963,7 @@
 class MseHBDWxHTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -1974,7 +1975,7 @@
     ASSERT_NE(dst_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_);
     aom_free(dst_);
     src_ = nullptr;
@@ -2062,6 +2063,8 @@
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+    AvxHBDDistWtdSubpelAvgVarianceTest;
 #if !CONFIG_REALTIME_ONLY
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
 #endif
@@ -2081,6 +2084,12 @@
 TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+#if !CONFIG_REALTIME_ONLY
+TEST_P(AvxHBDObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
 
 INSTANTIATE_TEST_SUITE_P(
     C, MseHBDWxHTest,
@@ -2106,6 +2115,14 @@
 
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
+    NEON, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_neon, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_neon,
+                                      10)));
+
+INSTANTIATE_TEST_SUITE_P(
     NEON, AvxHBDMseTest,
     ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_neon, 12),
                       MseParams(4, 3, &aom_highbd_12_mse16x8_neon, 12),
@@ -2121,6 +2138,15 @@
                       MseParams(3, 3, &aom_highbd_8_mse8x8_neon, 8)));
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AvxHBDMseTest,
+    ::testing::Values(MseParams(4, 4, &aom_highbd_8_mse16x16_neon_dotprod, 8),
+                      MseParams(4, 3, &aom_highbd_8_mse16x8_neon_dotprod, 8),
+                      MseParams(3, 4, &aom_highbd_8_mse8x16_neon_dotprod, 8),
+                      MseParams(3, 3, &aom_highbd_8_mse8x8_neon_dotprod, 8)));
+#endif  // HAVE_NEON_DOTPROD
+
 const VarianceParams kArrayHBDVariance_c[] = {
   VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
   VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
@@ -2389,27 +2415,177 @@
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 
+const DistWtdSubpelAvgVarianceParams kArrayHBDDistWtdSubpelAvgVariance_c[] = {
+  DistWtdSubpelAvgVarianceParams(
+      7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c, 8),
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c, 10),
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c, 12),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxHBDDistWtdSubpelAvgVarianceTest,
+    ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_c));
+
 #if !CONFIG_REALTIME_ONLY
 const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
-  ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
+  ObmcSubpelVarianceParams(7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_c,
                            8),
-  ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c,
+  ObmcSubpelVarianceParams(7, 6, &aom_highbd_8_obmc_sub_pixel_variance128x64_c,
                            8),
-  ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c,
+  ObmcSubpelVarianceParams(6, 7, &aom_highbd_8_obmc_sub_pixel_variance64x128_c,
                            8),
-  ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8),
-  ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8),
-  ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8),
-  ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8),
-  ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8),
-  ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8),
-  ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8),
-  ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8),
-  ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8),
-  ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8),
-  ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8),
-  ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8),
-  ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8),
+  ObmcSubpelVarianceParams(6, 6, &aom_highbd_8_obmc_sub_pixel_variance64x64_c,
+                           8),
+  ObmcSubpelVarianceParams(6, 5, &aom_highbd_8_obmc_sub_pixel_variance64x32_c,
+                           8),
+  ObmcSubpelVarianceParams(5, 6, &aom_highbd_8_obmc_sub_pixel_variance32x64_c,
+                           8),
+  ObmcSubpelVarianceParams(5, 5, &aom_highbd_8_obmc_sub_pixel_variance32x32_c,
+                           8),
+  ObmcSubpelVarianceParams(5, 4, &aom_highbd_8_obmc_sub_pixel_variance32x16_c,
+                           8),
+  ObmcSubpelVarianceParams(4, 5, &aom_highbd_8_obmc_sub_pixel_variance16x32_c,
+                           8),
+  ObmcSubpelVarianceParams(4, 4, &aom_highbd_8_obmc_sub_pixel_variance16x16_c,
+                           8),
+  ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_c,
+                           8),
+  ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_c,
+                           8),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_c, 8),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_c, 8),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_c, 8),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_c, 8),
   ObmcSubpelVarianceParams(7, 7,
                            &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10),
   ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c,
@@ -2475,12 +2651,18 @@
   ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
                            12),
 
-  ObmcSubpelVarianceParams(6, 4, &aom_highbd_obmc_sub_pixel_variance64x16_c, 8),
-  ObmcSubpelVarianceParams(4, 6, &aom_highbd_obmc_sub_pixel_variance16x64_c, 8),
-  ObmcSubpelVarianceParams(5, 3, &aom_highbd_obmc_sub_pixel_variance32x8_c, 8),
-  ObmcSubpelVarianceParams(3, 5, &aom_highbd_obmc_sub_pixel_variance8x32_c, 8),
-  ObmcSubpelVarianceParams(4, 2, &aom_highbd_obmc_sub_pixel_variance16x4_c, 8),
-  ObmcSubpelVarianceParams(2, 4, &aom_highbd_obmc_sub_pixel_variance4x16_c, 8),
+  ObmcSubpelVarianceParams(6, 4, &aom_highbd_8_obmc_sub_pixel_variance64x16_c,
+                           8),
+  ObmcSubpelVarianceParams(4, 6, &aom_highbd_8_obmc_sub_pixel_variance16x64_c,
+                           8),
+  ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_c,
+                           8),
+  ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_c,
+                           8),
+  ObmcSubpelVarianceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_c,
+                           8),
+  ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_c,
+                           8),
   ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c,
                            10),
   ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c,
@@ -2772,6 +2954,12 @@
   VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
   VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
   VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10),
+  VarianceParams(5, 3, &aom_highbd_10_variance32x8_avx2, 10),
+  VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10),
+  VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
@@ -3247,6 +3435,16 @@
                       MseWxHParams(2, 3, &aom_mse_wxh_16bit_neon, 8),
                       MseWxHParams(2, 2, &aom_mse_wxh_16bit_neon, 8)));
 
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Mse16xHTest,
+    ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(3, 2, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(2, 3, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(2, 2, &aom_mse_16xh_16bit_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_neon));
+
 INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest,
                          ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon),
                                            MseParams(3, 4, &aom_mse8x16_neon),
@@ -3342,6 +3540,52 @@
 INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArraySubpelAvgVariance_neon));
 
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_neon[] = {
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_neon, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_neon, 0),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvxDistWtdSubpelAvgVarianceTest,
+    ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_neon));
+
 #if !CONFIG_REALTIME_ONLY
 const ObmcSubpelVarianceParams kArrayObmcSubpelVariance_neon[] = {
   ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_neon, 0),
@@ -3463,8 +3707,559 @@
 
 INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_neon));
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_neon[] = {
+  SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_neon, 12),
+  SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_neon, 12),
+  SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_neon, 12),
+  SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_neon, 12),
+  SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_neon, 12),
+  SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_neon, 12),
+  SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_neon, 12),
+  SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_neon, 12),
+  SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_neon, 12),
+  SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_neon, 12),
+  SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_neon, 12),
+  SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_neon, 12),
+  SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_neon, 12),
+  SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_neon, 10),
+  SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_neon, 10),
+  SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_neon, 10),
+  SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_neon, 10),
+  SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_neon, 10),
+  SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_neon, 10),
+  SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_neon, 10),
+  SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_neon, 10),
+  SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_neon, 10),
+  SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_neon, 10),
+  SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_neon, 10),
+  SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_neon, 10),
+  SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_neon, 10),
+  SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_neon, 8),
+  SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_neon, 8),
+  SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_neon, 8),
+  SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_neon, 8),
+  SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_neon, 8),
+  SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_neon, 8),
+  SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_neon, 8),
+  SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_neon, 8),
+  SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_neon, 8),
+  SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_neon, 8),
+  SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_neon, 8),
+  SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_neon, 8),
+  SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_neon, 8),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_neon, 8),
+  SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_neon, 8),
+  SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_neon, 8),
+  SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_neon, 8),
+  SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_neon, 8),
+  SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_neon, 8),
+  SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_neon, 10),
+  SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_neon, 10),
+  SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_neon, 10),
+  SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_neon, 10),
+  SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_neon, 10),
+  SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_neon, 10),
+  SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_neon, 12),
+  SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_neon, 12),
+  SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_neon, 12),
+  SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_neon, 12),
+  SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_neon, 12),
+  SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_neon, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelVariance_neon));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_neon[] = {
+  SubpelAvgVarianceParams(7, 7,
+                          &aom_highbd_8_sub_pixel_avg_variance128x128_neon, 8),
+  SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_neon,
+                          8),
+  SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_neon,
+                          8),
+  SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_neon,
+                          8),
+  SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_neon,
+                          8),
+  SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_neon,
+                          8),
+  SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_neon,
+                          8),
+  SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_neon,
+                          8),
+  SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_neon,
+                          8),
+  SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_neon,
+                          8),
+  SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_neon,
+                          8),
+  SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_neon,
+                          8),
+  SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_neon,
+                          8),
+  SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_neon,
+                          8),
+  SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_neon,
+                          8),
+  SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_neon,
+                          8),
+  SubpelAvgVarianceParams(
+      7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_neon, 10),
+  SubpelAvgVarianceParams(7, 6,
+                          &aom_highbd_10_sub_pixel_avg_variance128x64_neon, 10),
+  SubpelAvgVarianceParams(6, 7,
+                          &aom_highbd_10_sub_pixel_avg_variance64x128_neon, 10),
+  SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_neon,
+                          10),
+  SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_neon,
+                          10),
+  SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_neon,
+                          10),
+  SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_neon,
+                          10),
+  SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_neon,
+                          10),
+  SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_neon,
+                          10),
+  SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_neon,
+                          10),
+  SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_neon,
+                          10),
+  SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_neon,
+                          10),
+  SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_neon,
+                          10),
+  SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_neon,
+                          10),
+  SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_neon,
+                          10),
+  SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_neon,
+                          10),
+  SubpelAvgVarianceParams(
+      7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_neon, 12),
+  SubpelAvgVarianceParams(7, 6,
+                          &aom_highbd_12_sub_pixel_avg_variance128x64_neon, 12),
+  SubpelAvgVarianceParams(6, 7,
+                          &aom_highbd_12_sub_pixel_avg_variance64x128_neon, 12),
+  SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_neon,
+                          12),
+  SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_neon,
+                          12),
+  SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_neon,
+                          12),
+  SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_neon,
+                          12),
+  SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_neon,
+                          12),
+  SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_neon,
+                          12),
+  SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_neon,
+                          12),
+  SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_neon,
+                          12),
+  SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_neon,
+                          12),
+  SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_neon,
+                          12),
+  SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_neon,
+                          12),
+  SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_neon,
+                          12),
+  SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_neon,
+                          12),
+
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_neon,
+                          8),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_neon,
+                          8),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_neon,
+                          8),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_neon,
+                          8),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_neon,
+                          8),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_neon,
+                          8),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_neon,
+                          10),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_neon,
+                          10),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_neon,
+                          10),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_neon,
+                          10),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_neon,
+                          10),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_neon,
+                          10),
+  SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_neon,
+                          12),
+  SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_neon,
+                          12),
+  SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_neon,
+                          12),
+  SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_neon,
+                          12),
+  SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_neon,
+                          12),
+  SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_neon,
+                          12),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_neon));
+
+const DistWtdSubpelAvgVarianceParams
+    kArrayHBDDistWtdSubpelAvgVariance_neon[] = {
+      DistWtdSubpelAvgVarianceParams(
+          7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon, 12),
+
+#if !CONFIG_REALTIME_ONLY
+      DistWtdSubpelAvgVarianceParams(
+          6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon, 8),
+      DistWtdSubpelAvgVarianceParams(
+          6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon, 10),
+      DistWtdSubpelAvgVarianceParams(
+          6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon, 12),
+      DistWtdSubpelAvgVarianceParams(
+          2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+    };
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvxHBDDistWtdSubpelAvgVarianceTest,
+    ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_neon));
+
+#if !CONFIG_REALTIME_ONLY
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_neon[] = {
+  ObmcSubpelVarianceParams(
+      7, 7, &aom_highbd_12_obmc_sub_pixel_variance128x128_neon, 12),
+  ObmcSubpelVarianceParams(
+      7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_neon, 12),
+  ObmcSubpelVarianceParams(
+      6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_neon, 12),
+  ObmcSubpelVarianceParams(
+      6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_neon, 12),
+  ObmcSubpelVarianceParams(
+      6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_neon, 12),
+  ObmcSubpelVarianceParams(
+      5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_neon, 12),
+  ObmcSubpelVarianceParams(
+      5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_neon, 12),
+  ObmcSubpelVarianceParams(
+      5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_neon, 12),
+  ObmcSubpelVarianceParams(
+      4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_neon, 12),
+  ObmcSubpelVarianceParams(
+      4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_neon, 12),
+  ObmcSubpelVarianceParams(4, 3,
+                           &aom_highbd_12_obmc_sub_pixel_variance16x8_neon, 12),
+  ObmcSubpelVarianceParams(3, 4,
+                           &aom_highbd_12_obmc_sub_pixel_variance8x16_neon, 12),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_neon,
+                           12),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_neon,
+                           12),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_neon,
+                           12),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_neon,
+                           12),
+  ObmcSubpelVarianceParams(
+      6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_neon, 12),
+  ObmcSubpelVarianceParams(
+      4, 6, &aom_highbd_12_obmc_sub_pixel_variance16x64_neon, 12),
+  ObmcSubpelVarianceParams(5, 3,
+                           &aom_highbd_12_obmc_sub_pixel_variance32x8_neon, 12),
+  ObmcSubpelVarianceParams(3, 5,
+                           &aom_highbd_12_obmc_sub_pixel_variance8x32_neon, 12),
+  ObmcSubpelVarianceParams(4, 2,
+                           &aom_highbd_12_obmc_sub_pixel_variance16x4_neon, 12),
+  ObmcSubpelVarianceParams(2, 4,
+                           &aom_highbd_12_obmc_sub_pixel_variance4x16_neon, 12),
+  ObmcSubpelVarianceParams(
+      7, 7, &aom_highbd_10_obmc_sub_pixel_variance128x128_neon, 10),
+  ObmcSubpelVarianceParams(
+      7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_neon, 10),
+  ObmcSubpelVarianceParams(
+      6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_neon, 10),
+  ObmcSubpelVarianceParams(
+      6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_neon, 10),
+  ObmcSubpelVarianceParams(
+      6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_neon, 10),
+  ObmcSubpelVarianceParams(
+      5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_neon, 10),
+  ObmcSubpelVarianceParams(
+      5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_neon, 10),
+  ObmcSubpelVarianceParams(
+      5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_neon, 10),
+  ObmcSubpelVarianceParams(
+      4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_neon, 10),
+  ObmcSubpelVarianceParams(
+      4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_neon, 10),
+  ObmcSubpelVarianceParams(4, 3,
+                           &aom_highbd_10_obmc_sub_pixel_variance16x8_neon, 10),
+  ObmcSubpelVarianceParams(3, 4,
+                           &aom_highbd_10_obmc_sub_pixel_variance8x16_neon, 10),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_neon,
+                           10),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_neon,
+                           10),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_neon,
+                           10),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_neon,
+                           10),
+  ObmcSubpelVarianceParams(
+      6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_neon, 10),
+  ObmcSubpelVarianceParams(
+      4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_neon, 10),
+  ObmcSubpelVarianceParams(5, 3,
+                           &aom_highbd_10_obmc_sub_pixel_variance32x8_neon, 10),
+  ObmcSubpelVarianceParams(3, 5,
+                           &aom_highbd_10_obmc_sub_pixel_variance8x32_neon, 10),
+  ObmcSubpelVarianceParams(4, 2,
+                           &aom_highbd_10_obmc_sub_pixel_variance16x4_neon, 10),
+  ObmcSubpelVarianceParams(2, 4,
+                           &aom_highbd_10_obmc_sub_pixel_variance4x16_neon, 10),
+  ObmcSubpelVarianceParams(
+      7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_neon, 8),
+  ObmcSubpelVarianceParams(7, 6,
+                           &aom_highbd_8_obmc_sub_pixel_variance128x64_neon, 8),
+  ObmcSubpelVarianceParams(6, 7,
+                           &aom_highbd_8_obmc_sub_pixel_variance64x128_neon, 8),
+  ObmcSubpelVarianceParams(6, 6,
+                           &aom_highbd_8_obmc_sub_pixel_variance64x64_neon, 8),
+  ObmcSubpelVarianceParams(6, 5,
+                           &aom_highbd_8_obmc_sub_pixel_variance64x32_neon, 8),
+  ObmcSubpelVarianceParams(5, 6,
+                           &aom_highbd_8_obmc_sub_pixel_variance32x64_neon, 8),
+  ObmcSubpelVarianceParams(5, 5,
+                           &aom_highbd_8_obmc_sub_pixel_variance32x32_neon, 8),
+  ObmcSubpelVarianceParams(5, 4,
+                           &aom_highbd_8_obmc_sub_pixel_variance32x16_neon, 8),
+  ObmcSubpelVarianceParams(4, 5,
+                           &aom_highbd_8_obmc_sub_pixel_variance16x32_neon, 8),
+  ObmcSubpelVarianceParams(4, 4,
+                           &aom_highbd_8_obmc_sub_pixel_variance16x16_neon, 8),
+  ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_neon,
+                           8),
+  ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_neon,
+                           8),
+  ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_neon,
+                           8),
+  ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_neon,
+                           8),
+  ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_neon,
+                           8),
+  ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_neon,
+                           8),
+  ObmcSubpelVarianceParams(6, 4,
+                           &aom_highbd_8_obmc_sub_pixel_variance64x16_neon, 8),
+  ObmcSubpelVarianceParams(4, 6,
+                           &aom_highbd_8_obmc_sub_pixel_variance16x64_neon, 8),
+  ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_neon,
+                           8),
+  ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_neon,
+                           8),
+  ObmcSubpelVarianceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_neon,
+                           8),
+  ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_neon,
+                           8),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDObmcSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_neon));
+#endif  // !CONFIG_REALTIME_ONLY
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+
+const VarianceParams kArrayVariance_neon_dotprod[] = {
+  VarianceParams(7, 7, &aom_variance128x128_neon_dotprod),
+  VarianceParams(7, 6, &aom_variance128x64_neon_dotprod),
+  VarianceParams(6, 7, &aom_variance64x128_neon_dotprod),
+  VarianceParams(6, 6, &aom_variance64x64_neon_dotprod),
+  VarianceParams(6, 5, &aom_variance64x32_neon_dotprod),
+  VarianceParams(5, 6, &aom_variance32x64_neon_dotprod),
+  VarianceParams(5, 5, &aom_variance32x32_neon_dotprod),
+  VarianceParams(5, 4, &aom_variance32x16_neon_dotprod),
+  VarianceParams(4, 5, &aom_variance16x32_neon_dotprod),
+  VarianceParams(4, 4, &aom_variance16x16_neon_dotprod),
+  VarianceParams(4, 3, &aom_variance16x8_neon_dotprod),
+  VarianceParams(3, 4, &aom_variance8x16_neon_dotprod),
+  VarianceParams(3, 3, &aom_variance8x8_neon_dotprod),
+  VarianceParams(3, 2, &aom_variance8x4_neon_dotprod),
+  VarianceParams(2, 3, &aom_variance4x8_neon_dotprod),
+  VarianceParams(2, 2, &aom_variance4x4_neon_dotprod),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(2, 4, &aom_variance4x16_neon_dotprod),
+  VarianceParams(4, 2, &aom_variance16x4_neon_dotprod),
+  VarianceParams(3, 5, &aom_variance8x32_neon_dotprod),
+  VarianceParams(5, 3, &aom_variance32x8_neon_dotprod),
+  VarianceParams(4, 6, &aom_variance16x64_neon_dotprod),
+  VarianceParams(6, 4, &aom_variance64x16_neon_dotprod),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_neon_dotprod));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_neon_dotprod[] = {
+  GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+  GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+  GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+  GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, GetSseSum8x8QuadTest,
+    ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon_dotprod));
+
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_neon_dotprod[] = {
+  GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+  GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+  GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+  GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, GetSseSum16x16DualTest,
+    ::testing::ValuesIn(kArrayGetSseSum16x16Dual_neon_dotprod));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AvxMseTest,
+    ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon_dotprod),
+                      MseParams(3, 4, &aom_mse8x16_neon_dotprod),
+                      MseParams(4, 4, &aom_mse16x16_neon_dotprod),
+                      MseParams(4, 3, &aom_mse16x8_neon_dotprod)));
+
+#endif  // HAVE_NEON_DOTPROD
+
 }  // namespace
diff --git a/test/video_source.h b/test/video_source.h
index f7a8b98..9d73d7b 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -125,7 +125,7 @@
 // aom_image_t images with associated timestamps and duration.
 class VideoSource {
  public:
-  virtual ~VideoSource() {}
+  virtual ~VideoSource() = default;
 
   // Prepare the stream for reading, rewind/open as necessary.
   virtual void Begin() = 0;
@@ -160,35 +160,35 @@
     ReallocImage();
   }
 
-  virtual ~DummyVideoSource() { aom_img_free(img_); }
+  ~DummyVideoSource() override { aom_img_free(img_); }
 
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual aom_image_t *img() const {
+  aom_image_t *img() const override {
     return (frame_ < limit_) ? img_ : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual aom_codec_pts_t pts() const { return frame_; }
+  aom_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual aom_rational_t timebase() const {
+  aom_rational_t timebase() const override {
     const aom_rational_t t = { 1, 30 };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   void set_limit(unsigned int limit) { limit_ = limit; }
 
@@ -234,7 +234,7 @@
       : rnd_(seed), seed_(seed) {}
 
   // Reset the RNG to get a matching stream for the second pass
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     rnd_.Reset(seed_);
     FillFrame();
@@ -243,7 +243,7 @@
  protected:
   // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
   // than holding previous frames to encourage keyframes to be thrown.
-  virtual void FillFrame() {
+  void FillFrame() override {
     if (img_) {
       if (frame_ % 30 < 15)
         for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8();
@@ -260,7 +260,7 @@
 // decompressed images to the decoder.
 class CompressedVideoSource {
  public:
-  virtual ~CompressedVideoSource() {}
+  virtual ~CompressedVideoSource() = default;
 
   virtual void Init() = 0;
 
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 1d9dd45..f0be7d2 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -33,19 +33,21 @@
     C, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
 
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
-
-#if CONFIG_AV1_HIGHBITDEPTH
+#if CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
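+// Test bodies shared by the high bit-depth SIMD instantiations below.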
 TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
   RunCheckOutput(std::get<4>(GET_PARAM(0)));
 }
 TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
   RunSpeedTest(std::get<4>(GET_PARAM(0)));
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1WarpFilterTest,
+    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
+
+#if CONFIG_AV1_HIGHBITDEPTH
 INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest,
                          libaom_test::AV1HighbdWarpFilter::BuildParams(
                              av1_highbd_warp_affine_sse4_1));
@@ -58,7 +60,6 @@
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
 
 #if CONFIG_AV1_HIGHBITDEPTH
-
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdWarpFilterTest,
     libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
@@ -69,6 +70,24 @@
 INSTANTIATE_TEST_SUITE_P(
     NEON, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1HighbdWarpFilterTest,
+    libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_neon));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+    NEON_I8MM, AV1WarpFilterTest,
+    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon_i8mm));
+#endif  // HAVE_NEON_I8MM
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, AV1WarpFilterTest,
+    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve));
+#endif  // HAVE_SVE
+
 }  // namespace
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index e42671e..470c980 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -19,9 +19,14 @@
 
 namespace libaom_test {
 
-int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
-  // 1 in 8 chance of generating zero (arbitrarily chosen)
-  if (((rnd->Rand8()) & 7) == 0) return 0;
+int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits,
+                            int rnd_gen_zeros) {
+  // Avoid accidentally generating a zero in speed tests; zeros are forced
+  // explicitly via the is_*_zero parameters instead.
+  if (rnd_gen_zeros) {
+    // 1 in 8 chance of generating zero (arbitrarily chosen)
+    if (((rnd->Rand8()) & 7) == 0) return 0;
+  }
   // Otherwise, generate uniform values in the range
   // [-(1 << bits), -1] U [1, 1 << bits]
   int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
@@ -33,34 +38,47 @@
                            int16_t *alpha, int16_t *beta, int16_t *gamma,
                            int16_t *delta, const int is_alpha_zero,
                            const int is_beta_zero, const int is_gamma_zero,
-                           const int is_delta_zero) {
-  while (1) {
+                           const int is_delta_zero, const int rnd_gen_zeros) {
+  while (true) {
     int rnd8 = rnd->Rand8() & 3;
-    mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
-    mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
-    mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
-             (1 << WARPEDMODEL_PREC_BITS);
-    mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+    mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+    mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+    mat[2] =
+        (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+        (1 << WARPEDMODEL_PREC_BITS);
+    mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
 
     if (rnd8 <= 1) {
       // AFFINE
-      mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
-      mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
-               (1 << WARPEDMODEL_PREC_BITS);
+      mat[4] =
+          random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+      mat[5] =
+          (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+          (1 << WARPEDMODEL_PREC_BITS);
     } else if (rnd8 == 2) {
       mat[4] = -mat[3];
       mat[5] = mat[2];
     } else {
-      mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
-      mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
-               (1 << WARPEDMODEL_PREC_BITS);
-      if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
-      if (is_beta_zero == 1) mat[3] = 0;
-      if (is_gamma_zero == 1) mat[4] = 0;
-      if (is_delta_zero == 1)
-        mat[5] = static_cast<int32_t>(
-            ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
-            (1 << WARPEDMODEL_PREC_BITS));
+      mat[4] =
+          random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+      mat[5] =
+          (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+          (1 << WARPEDMODEL_PREC_BITS);
+    }
+
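+    // Force the selected shear parameters to zero when requested; alpha,
+    // beta, gamma and delta are derived from mat[2..5].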
+    if (is_alpha_zero == 1) {
+      mat[2] = 1 << WARPEDMODEL_PREC_BITS;
+    }
+    if (is_beta_zero == 1) {
+      mat[3] = 0;
+    }
+    if (is_gamma_zero == 1) {
+      mat[4] = 0;
+    }
+    if (is_delta_zero == 1) {
+      mat[5] = static_cast<int32_t>(
+          ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
+          (1 << WARPEDMODEL_PREC_BITS));
     }
 
     // Calculate the derived parameters and check that they are suitable
@@ -109,11 +127,9 @@
                             ::testing::Values(0, 1), ::testing::Values(0, 1));
 }
 
-AV1WarpFilterTest::~AV1WarpFilterTest() {}
+AV1WarpFilterTest::~AV1WarpFilterTest() = default;
 void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-void AV1WarpFilterTest::TearDown() {}
-
 void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   const int w = 128, h = 128;
   const int border = 16;
@@ -144,7 +160,7 @@
   ASSERT_NE(dsta, nullptr);
   generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                         is_alpha_zero, is_beta_zero, is_gamma_zero,
-                        is_delta_zero);
+                        is_delta_zero, 0);
 
   for (int r = 0; r < h; ++r)
     for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
@@ -170,8 +186,8 @@
 
   aom_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
-         1000.0 * elapsed_time / num_loops);
+  printf("warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n", out_w,
+         out_h, alpha, beta, gamma, delta, 1000.0 * elapsed_time / num_loops);
 }
 
 void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
@@ -221,7 +237,7 @@
       for (int sub_y = 0; sub_y < 2; ++sub_y) {
         generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                               is_alpha_zero, is_beta_zero, is_gamma_zero,
-                              is_delta_zero);
+                              is_delta_zero, 1);
 
         for (int ii = 0; ii < 2; ++ii) {
           for (int jj = 0; jj < 5; ++jj) {
@@ -301,13 +317,11 @@
                             ::testing::Values(0, 1), ::testing::Values(0, 1));
 }
 
-AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
+AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() = default;
 void AV1HighbdWarpFilterTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdWarpFilterTest::TearDown() {}
-
 void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   const int w = 128, h = 128;
   const int border = 16;
@@ -339,7 +353,7 @@
 
   generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                         is_alpha_zero, is_beta_zero, is_gamma_zero,
-                        is_delta_zero);
+                        is_delta_zero, 0);
   // Generate an input block and extend its borders horizontally
   for (int r = 0; r < h; ++r)
     for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
@@ -367,7 +381,8 @@
 
   aom_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+  printf("highbd warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n",
+         out_w, out_h, alpha, beta, gamma, delta,
          1000.0 * elapsed_time / num_loops);
 }
 
@@ -422,7 +437,7 @@
       for (int sub_y = 0; sub_y < 2; ++sub_y) {
         generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                               is_alpha_zero, is_beta_zero, is_gamma_zero,
-                              is_delta_zero);
+                              is_delta_zero, 1);
         for (int ii = 0; ii < 2; ++ii) {
           for (int jj = 0; jj < 5; ++jj) {
             for (int do_average = 0; do_average <= 1; ++do_average) {
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index 583f312..364368a 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -50,10 +50,8 @@
 
 class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
  public:
-  virtual ~AV1WarpFilterTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
+  ~AV1WarpFilterTest() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(warp_affine_func test_impl);
@@ -86,10 +84,8 @@
 class AV1HighbdWarpFilterTest
     : public ::testing::TestWithParam<HighbdWarpTestParams> {
  public:
-  virtual ~AV1HighbdWarpFilterTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
+  ~AV1HighbdWarpFilterTest() override;
+  void SetUp() override;
 
  protected:
   void RunCheckOutput(highbd_warp_affine_func test_impl);
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 706e596..845abd6 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -30,19 +30,19 @@
         webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0),
         frame_sz_(0), frame_number_(0), end_of_file_(false) {}
 
-  virtual ~WebMVideoSource() {
+  ~WebMVideoSource() override {
     if (aom_ctx_->file != nullptr) fclose(aom_ctx_->file);
     webm_free(webm_ctx_);
     delete aom_ctx_;
     delete webm_ctx_;
   }
 
-  virtual void Init() {
+  void Init() override {
     ASSERT_NE(aom_ctx_, nullptr);
     ASSERT_NE(webm_ctx_, nullptr);
   }
 
-  virtual void Begin() {
+  void Begin() override {
     ASSERT_NE(aom_ctx_, nullptr);
     ASSERT_NE(webm_ctx_, nullptr);
     aom_ctx_->file = OpenTestDataFile(file_name_);
@@ -54,7 +54,7 @@
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_number_;
     FillFrame();
   }
@@ -85,11 +85,11 @@
     } while (!webm_ctx_->is_key_frame && !end_of_file_);
   }
 
-  virtual const uint8_t *cxdata() const {
+  const uint8_t *cxdata() const override {
     return end_of_file_ ? nullptr : buf_;
   }
-  virtual size_t frame_size() const { return frame_sz_; }
-  virtual unsigned int frame_number() const { return frame_number_; }
+  size_t frame_size() const override { return frame_sz_; }
+  unsigned int frame_number() const override { return frame_number_; }
 
  protected:
   std::string file_name_;
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index 8be6a64..7eb6372 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -190,7 +190,7 @@
 
 class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     src_buf = (uint8_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
     ASSERT_NE(src_buf, nullptr);
@@ -204,7 +204,7 @@
     memset(buf, 0, buf_size);
     target_func_ = GET_PARAM(0);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_buf);
     aom_free(dgd_buf);
     aom_free(buf);
@@ -322,9 +322,11 @@
       buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
 
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Fill with alternating extreme values to maximize the difference from
+    // the average.
     for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_buf[i] = 255;
-      src_buf[i] = 255;
+      dgd_buf[i] = i & 1 ? 255 : 0;
+      src_buf[i] = i & 1 ? 255 : 0;
     }
     uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
     uint8_t *src = src_buf;
@@ -389,6 +391,12 @@
                          ::testing::Values(av1_compute_stats_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTest,
+                         ::testing::Values(av1_compute_stats_neon));
+#endif  // HAVE_NEON
+
 }  // namespace wiener_lowbd
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -531,7 +539,7 @@
 
 class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     src_buf = (uint16_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
     ASSERT_NE(src_buf, nullptr);
@@ -540,7 +548,7 @@
     ASSERT_NE(dgd_buf, nullptr);
     target_func_ = GET_PARAM(0);
   }
-  virtual void TearDown() {
+  void TearDown() override {
     aom_free(src_buf);
     aom_free(dgd_buf);
   }
@@ -650,9 +658,11 @@
   const int src_stride = MAX_DATA_BLOCK;
   const int iters = 1;
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Fill with alternating extreme values to maximize the difference from
+    // the average.
     for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_buf[i] = ((uint16_t)1 << bit_depth) - 1;
-      src_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+      dgd_buf[i] = i & 1 ? ((uint16_t)1 << bit_depth) - 1 : 0;
+      src_buf[i] = i & 1 ? ((uint16_t)1 << bit_depth) - 1 : 0;
     }
     const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
         dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
@@ -728,6 +738,11 @@
                          ::testing::Values(av1_compute_stats_highbd_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTestHighbd,
+                         ::testing::Values(av1_compute_stats_highbd_neon));
+#endif  // HAVE_NEON
+
 // A test that reproduces b/274668506: signed integer overflow in
 // update_a_sep_sym().
 TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateASepSym) {
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index 515a783..a4ed13f 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -66,7 +66,7 @@
  protected:
   Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
 
-  virtual ~Y4mVideoSourceTest() { CloseSource(); }
+  ~Y4mVideoSourceTest() override { CloseSource(); }
 
   virtual void Init(const std::string &file_name, int limit) {
     file_name_ = file_name;
@@ -128,7 +128,7 @@
  protected:
   Y4mVideoWriteTest() : tmpfile_(nullptr) {}
 
-  virtual ~Y4mVideoWriteTest() {
+  ~Y4mVideoWriteTest() override {
     delete tmpfile_;
     input_file_ = nullptr;
   }
@@ -162,7 +162,7 @@
     ReplaceInputFile(tmpfile_->file());
   }
 
-  virtual void Init(const std::string &file_name, int limit) {
+  void Init(const std::string &file_name, int limit) override {
     Y4mVideoSourceTest::Init(file_name, limit);
     WriteY4mAndReadBack();
   }
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index bf65776..1369e4e 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -28,7 +28,7 @@
         start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
         framerate_denominator_(0), y4m_() {}
 
-  virtual ~Y4mVideoSource() {
+  ~Y4mVideoSource() override {
     aom_img_free(img_.get());
     CloseSource();
   }
@@ -53,33 +53,33 @@
     FillFrame();
   }
 
-  virtual void Begin() {
+  void Begin() override {
     OpenSource();
     ReadSourceToStart();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual aom_image_t *img() const {
+  aom_image_t *img() const override {
     return (frame_ < limit_) ? img_.get() : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual aom_codec_pts_t pts() const { return frame_; }
+  aom_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual aom_rational_t timebase() const {
+  aom_rational_t timebase() const override {
     const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void FillFrame() {
     ASSERT_NE(input_file_, nullptr);
diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h
index 1b898b5..77d5dfa 100644
--- a/test/yuv_video_source.h
+++ b/test/yuv_video_source.h
@@ -36,12 +36,12 @@
     SetSize(width, height, format);
   }
 
-  virtual ~YUVVideoSource() {
+  ~YUVVideoSource() override {
     aom_img_free(img_);
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Begin() {
+  void Begin() override {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
     ASSERT_NE(input_file_, nullptr)
@@ -53,28 +53,28 @@
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual aom_image_t *img() const {
+  aom_image_t *img() const override {
     return (frame_ < limit_) ? img_ : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual aom_codec_pts_t pts() const { return frame_; }
+  aom_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual aom_rational_t timebase() const {
+  aom_rational_t timebase() const override {
     const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void SetSize(unsigned int width, unsigned int height,
                        aom_img_fmt format) {
diff --git a/third_party/fastfeat/README.libaom b/third_party/fastfeat/README.libaom
index 8aaee12..556d8b6 100644
--- a/third_party/fastfeat/README.libaom
+++ b/third_party/fastfeat/README.libaom
@@ -41,3 +41,4 @@
 Add error checking
 Add output argument to hold the scores of the detected features
 Add assertion and rewrite comparisons to appease the scan-build static analyzer
+Set output argument *ret_num_corners to -1 to signal memory allocation failure
diff --git a/third_party/fastfeat/fast.c b/third_party/fastfeat/fast.c
index a684a33..c475b4c 100644
--- a/third_party/fastfeat/fast.c
+++ b/third_party/fastfeat/fast.c
@@ -42,7 +42,21 @@
   xy* nonmax;
 
   corners = aom_fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+  if(!corners)
+  {
+    // Memory allocation failure
+    *ret_num_corners = -1;
+    return NULL;
+  }
+  // num_corners may be zero.
   scores = aom_fast9_score(im, stride, corners, num_corners, b);
+  if(!scores && num_corners > 0)
+  {
+    // Memory allocation failure
+    free(corners);
+    *ret_num_corners = -1;
+    return NULL;
+  }
   nonmax = aom_nonmax_suppression(corners, scores, num_corners, ret_scores, ret_num_corners);
 
   free(corners);
diff --git a/third_party/fastfeat/fast.h b/third_party/fastfeat/fast.h
index 7fd199f..228ba85 100644
--- a/third_party/fastfeat/fast.h
+++ b/third_party/fastfeat/fast.h
@@ -37,10 +37,14 @@
 
 int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
 
+// Returns NULL on memory allocation failure.
 xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
 
-int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+// If num_corners > 0, returns NULL on memory allocation failure.
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b);
 
+// Sets *ret_num_corners to -1 (and returns NULL) on memory allocation failure.
+// Sets *ret_num_corners to 0 if nothing went wrong but no corners were found.
 xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b,
                             int** ret_scores, int* ret_num_corners);
 
diff --git a/third_party/fastfeat/fast_9.c b/third_party/fastfeat/fast_9.c
index 345c37f..de55ab5 100644
--- a/third_party/fastfeat/fast_9.c
+++ b/third_party/fastfeat/fast_9.c
@@ -31,9 +31,7 @@
 // clang-format off
 /*This is mechanically generated code*/
 #include <stdlib.h>
-
-typedef struct { int x, y; } xy;
-typedef unsigned char byte;
+#include "fast.h"
 
 int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
 {
@@ -2988,7 +2986,7 @@
 
 
 
-int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b)
 {
   int* scores = (int*)malloc(sizeof(int)* num_corners);
   int n;
@@ -5927,8 +5925,13 @@
       if(num_corners == rsize)
       {
         rsize*=2;
-        ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
-        if(!ret_corners) return NULL;
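+        // Use a temporary so the original buffer can still be freed if
+        // realloc fails.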
+        xy* new_ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
+        if(!new_ret_corners)
+        {
+          free(ret_corners);
+          return NULL;
+        }
+        ret_corners = new_ret_corners;
       }
       ret_corners[num_corners].x = x;
       ret_corners[num_corners].y = y;
diff --git a/third_party/fastfeat/nonmax.c b/third_party/fastfeat/nonmax.c
index cc0ada7..a6f7da0 100644
--- a/third_party/fastfeat/nonmax.c
+++ b/third_party/fastfeat/nonmax.c
@@ -53,9 +53,10 @@
   int point_below = 0;
 
   *ret_scores = 0;
-  *ret_num_nonmax = 0;
+  *ret_num_nonmax = -1;
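+  // Default to -1 so that early returns on failure (e.g. memory allocation)
+  // report an error; the valid empty-input case below resets it to 0.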
   if(!(corners && scores) || num_corners < 1)
   {
+    *ret_num_nonmax = 0;
     return 0;
   }