Revert "Refactor and tidy up av1_convolve_2d_sr_neon"

This reverts commit aac4e7f3f7223108315e83f2ec1eec23bdbac02e.

This causes threaded test failures:
  AV1/AVxEncoderThreadTest.EncoderResultTest/*
  AV1/TileIndependenceLSTest.MD5Match/*
  AV1MultiThreaded/TestVectorTest.MD5Match/*

The follow-up commits are also included in this revert:

Revert "Optimize av1_dist_wtd_convolve_2d_neon using SDOT instruction"

This reverts commit 147a1f52bec96c7eec86583d229ac9c5c90fe5fc.

Revert "Optimize av1_convolve_2d_sr_neon using SDOT instruction"

This reverts commit 2cc0666de131c2b44db08d0f51c07a2d7cd4073f.

Revert "Split av1_convolve_2d_sr_neon into horizontal/vertical helpers"

This reverts commit 9dfc5b34d857de0003b888594fc7dc62c8504d0b.

Change-Id: I7516a596bd9b39fa0ca70dec154aeea9dafda77f
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 40be27d..1a93f35 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -15,63 +15,29 @@
 #include <string.h>
 #include "aom_dsp/aom_dsp_common.h"
 
-// Support for xN Neon intrinsics is lacking in some compilers.
-#if defined(__arm__) || defined(_M_ARM)
-#define ARM_32_BIT
-#endif
-
-// DEFICIENT_CLANG_32_BIT includes clang-cl.
-#if defined(__clang__) && defined(ARM_32_BIT) && \
-    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
-#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
-#define GCC_32_BIT
-#endif
-
-#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
-
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
-  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
-                         vld1q_u8(ptr + 2 * 16) } };
-  return res;
-}
-
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+// Support for these xN intrinsics is lacking in older compilers.
+#if (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) || \
+    (defined(__GNUC__) &&                                               \
+     ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||    \
+      (defined(__clang__) && defined(__arm__) &&                        \
+       (__clang_major__ <= 6 ||                                         \
+        (defined(__ANDROID__) && __clang_major__ <= 7)))))
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+static INLINE uint16x8x4_t vld1q_u16_x4(uint16_t const *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
 }
-
-#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
-#if __GNUC__ < 8
-
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
-  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
-  return res;
-}
-
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
-  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
-                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
-  return res;
-}
-#endif  // __GNUC__ < 8
-
-#if __GNUC__ < 9
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
-  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
-                         vld1q_u8(ptr + 2 * 16) } };
-  return res;
-}
-#endif  // __GNUC__ < 9
-#endif  // defined(__GNUC__) && !defined(__clang__)
+#endif  // (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) ||
+        // (defined(__GNUC__) &&
+        //  ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||
+        //   (defined(__clang__) && defined(__arm__) &&
+        //    (__clang_major__ <= 6 ||
+        //     (defined(__ANDROID__) && __clang_major__ <= 7)))))
 
 static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                      const uint8x8_t s1) {
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index ff51b6c..28009d8 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -844,152 +844,6 @@
   }
 }
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void av1_convolve_2d_sr_horiz_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
-    int im_h, const int16x8_t x_filter_s16, const int round_0) {
-  const int bd = 8;
-
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-
-  int height = im_h;
-
-  // Filter values are even, so downshift by 1 to reduce intermediate precision
-  // requirements.
-  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
-  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
-  // Dot product constants.
-  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
-  const int32x4_t correction =
-      vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
-  assert(round_0 > 0);
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-      t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
-                               permute_tbl);
-      t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
-                               permute_tbl);
-      t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
-                               permute_tbl);
-      t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
-                               permute_tbl);
-
-      d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
-      d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
-      d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
-      d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
-
-      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        s0 = vld1q_u8(src_ptr);
-        t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
-                                 permute_tbl);
-        d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
-
-        vst1_s16(dst_ptr, d0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      assert(height >= 4);
-
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      do {
-        s0 = vld1q_u8(s + 0 * src_stride);
-        s1 = vld1q_u8(s + 1 * src_stride);
-        s2 = vld1q_u8(s + 2 * src_stride);
-        s3 = vld1q_u8(s + 3 * src_stride);
-
-        d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-
-        vst1q_s16(d + 0 * dst_stride, d0);
-        vst1q_s16(d + 1 * dst_stride, d1);
-        vst1q_s16(d + 2 * dst_stride, d2);
-        vst1q_s16(d + 3 * dst_stride, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * im_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          s0 = vld1q_u8(s);
-          d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
-                                   permute_tbl, shift_round_0);
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width > 0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-        height--;
-      } while (height > 0);
-    }
-  }
-}
-
-#else  // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
-
 // Horizontal filtering for convolve_2d_sr for width multiple of 8
 // Processes one row at a time
 static INLINE void horiz_filter_w8_single_row(
@@ -1038,12 +892,10 @@
 
 // Horizontal filtering for convolve_2d_sr for width <= 4
 // Processes one row at a time
-static INLINE void horiz_filter_w4_single_row(const uint8_t *src_ptr,
-                                              int src_stride, int16_t *dst_ptr,
-                                              const int dst_stride, int height,
-                                              const int16x8_t x_filter,
-                                              const int16x4_t horiz_const,
-                                              const int16x4_t shift_round_0) {
+static INLINE void horiz_filter_w4_single_row(
+    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+    const int dst_stride, int width, int height, const int16x8_t x_filter,
+    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
   int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
   do {
     const uint8_t *s = src_ptr;
@@ -1071,42 +923,83 @@
     int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      horiz_const, shift_round_0);
 
-    vst1_s16(dst_ptr, d0);
+    if (width == 4) {
+      vst1_s16(dst_ptr, d0);
+      dst_ptr += dst_stride;
+    } else if (width == 2) {
+      vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+      dst_ptr += dst_stride;
+    }
 
-    dst_ptr += dst_stride;
     src_ptr += src_stride;
     height--;
   } while (height > 0);
 }
 
-static INLINE void av1_convolve_2d_sr_horiz_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
-    int im_h, const int16x8_t x_filter_s16, const int round_0) {
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_qn, const int subpel_y_qn,
+                             ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+  int im_dst_stride;
+  int width, height;
+#if defined(__aarch64__)
+  uint8x8_t t0;
+  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+  const uint8_t *s;
+#endif
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
   const int bd = 8;
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
 
-  const uint8_t *src_ptr = src;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
 
-  int height = im_h;
+  int16_t *dst_ptr;
+
+  dst_ptr = im_block;
+  im_dst_stride = im_stride;
+  height = im_h;
+  width = w;
+
+  const int16_t round_bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   // Filter values are even, so downshift by 1 to reduce intermediate precision
   // requirements.
-  const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
+  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
-  assert(round_0 > 0);
+  assert(conv_params->round_0 > 0);
 
-  if (w == 4) {
+  if (w <= 4) {
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
-    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
 
 #if defined(__aarch64__)
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
     do {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-      uint8x8_t t0, t1, t2, t3;
-      const uint8_t *s = src_ptr;
-
       assert(height >= 4);
+      s = src_ptr;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
 
       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1119,6 +1012,10 @@
       s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
       s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
 
+      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
       s += 7;
 
       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
@@ -1139,44 +1036,56 @@
                              horiz_const, shift_round_0);
 
       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
-      vst1_s16((dst_ptr + 0 * dst_stride), d0);
-      vst1_s16((dst_ptr + 1 * dst_stride), d1);
-      vst1_s16((dst_ptr + 2 * dst_stride), d2);
-      vst1_s16((dst_ptr + 3 * dst_stride), d3);
-
+      if (w == 4) {
+        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+      } else if (w == 2) {
+        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+                      vreinterpret_u32_s16(d0), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+                      vreinterpret_u32_s16(d1), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+                      vreinterpret_u32_s16(d2), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+                      vreinterpret_u32_s16(d3), 0);
+      }
       src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
+      dst_ptr += 4 * im_dst_stride;
       height -= 4;
     } while (height >= 4);
 
     if (height) {
       assert(height < 4);
-      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride,
+      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
                                  height, x_filter, horiz_const, shift_round_0);
     }
-
-#else   // !defined(__aarch64__)
-    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, height,
-                               x_filter, horiz_const, shift_round_0);
-#endif  // defined(__aarch64__)
+#else
+    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+                               height, x_filter, horiz_const, shift_round_0);
+#endif
 
   } else {
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
-    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
 #if defined(__aarch64__)
+    int16_t *d_tmp;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+    do {
+      assert(height >= 8);
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
 
-    for (; height >= 8; height -= 8) {
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
-          d0, d1, d2, d3, d4, d5, d6, d7;
-      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
-
-      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
       transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
@@ -1188,7 +1097,18 @@
       s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
       s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
-      s += 7;
+      width = w;
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+
+      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
 
       do {
         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1204,26 +1124,28 @@
         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-        d0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
-                               horiz_const, shift_round_0);
-        d1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
-                               horiz_const, shift_round_0);
-        d2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
-                               horiz_const, shift_round_0);
-        d3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
-                               horiz_const, shift_round_0);
-        d4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
-                               horiz_const, shift_round_0);
-        d5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
-                               horiz_const, shift_round_0);
-        d6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
-                               horiz_const, shift_round_0);
-        d7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
-                               horiz_const, shift_round_0);
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 horiz_const, shift_round_0);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                 horiz_const, shift_round_0);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                 horiz_const, shift_round_0);
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                 horiz_const, shift_round_0);
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+                                 horiz_const, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                 horiz_const, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                 horiz_const, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+                                 horiz_const, shift_round_0);
 
-        transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7);
 
-        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
+                      res6, res7);
 
         s0 = s8;
         s1 = s9;
@@ -1233,241 +1155,246 @@
         s5 = s13;
         s6 = s14;
         s += 8;
-        d += 8;
+        d_tmp += 8;
         width -= 8;
       } while (width > 0);
-
       src_ptr += 8 * src_stride;
-      dst_ptr += 8 * dst_stride;
-    }
+      dst_ptr += 8 * im_dst_stride;
+      height -= 8;
+    } while (height >= 8);
 
-    for (; height >= 4; height -= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
-          dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
-      int16x8_t d0, d1, d2, d3;
-      uint8x8_t t0, t1, t2, t3;
+    if (height >= 4) {
+      assert(height < 8);
+      int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+          reg10, reg11, reg12, reg13, reg14;
+      int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+      int16x8_t out0, out1, out2, out3;
 
-      const uint8_t *s = src_ptr;
-      int16_t *d = dst_ptr;
-      int width = w;
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
 
       load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
 
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
 
-      s += 7;
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+      width = w;
 
       do {
         load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
         transpose_u8_8x4(&t0, &t1, &t2, &t3);
 
-        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-        s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-        dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
-        dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
-        dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
-        dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
-        dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
-        dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
-        dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
-        dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
+        d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                           x_filter);
 
-        transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
-                          &d1, &d2, &d3);
+        d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+                           x_filter);
 
-        d0 = vaddq_s16(d0, horiz_const);
-        d1 = vaddq_s16(d1, horiz_const);
-        d2 = vaddq_s16(d2, horiz_const);
-        d3 = vaddq_s16(d3, horiz_const);
+        d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+                           x_filter);
 
-        d0 = vqrshlq_s16(d0, shift_round_0);
-        d1 = vqrshlq_s16(d1, shift_round_0);
-        d2 = vqrshlq_s16(d2, shift_round_0);
-        d3 = vqrshlq_s16(d3, shift_round_0);
+        d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+                           x_filter);
 
-        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+        d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+                           x_filter);
 
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
+        d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+                           x_filter);
+
+        d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+                           x_filter);
+
+        d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
+                           x_filter);
+
+        transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
+                          &out2, &out3);
+
+        out0 = vaddq_s16(out0, horiz_const);
+        out0 = vqrshlq_s16(out0, shift_round_0);
+
+        out1 = vaddq_s16(out1, horiz_const);
+        out1 = vqrshlq_s16(out1, shift_round_0);
+
+        out2 = vaddq_s16(out2, horiz_const);
+        out2 = vqrshlq_s16(out2, shift_round_0);
+
+        out3 = vaddq_s16(out3, horiz_const);
+        out3 = vqrshlq_s16(out3, shift_round_0);
+
+        store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+        reg0 = reg8;
+        reg1 = reg9;
+        reg2 = reg10;
+        reg3 = reg11;
+        reg4 = reg12;
+        reg5 = reg13;
+        reg6 = reg14;
         s += 8;
-        d += 8;
+        d_tmp += 8;
         width -= 8;
       } while (width > 0);
-
       src_ptr += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
+      dst_ptr += 4 * im_dst_stride;
+      height -= 4;
     }
 
     if (height) {
       assert(height < 4);
-      horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+      horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
                                  height, x_filter, horiz_const, shift_round_0);
     }
+#else
 
-#else   // !defined(__aarch64__)
-    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
                                height, x_filter, horiz_const, shift_round_0);
-#endif  // defined(__aarch64__)
+#endif
   }
-}
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+  // vertical
+  {
+    uint8_t *dst_u8_ptr, *d_u8;
+    int16_t *v_src_ptr, *v_s;
 
-static INLINE void av1_convolve_2d_sr_vert_neon(
-    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
-    int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int16_t round_bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+                              (1 << (offset_bits - conv_params->round_1 - 1));
+    const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
 
-  const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
-                            (1 << (offset_bits - conv_params->round_1 - 1));
+    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
 
-  const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
-  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-  const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+    src_stride = im_stride;
+    v_src_ptr = im_block;
+    dst_u8_ptr = dst;
 
-  if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-    int16x8_t dd0;
-    uint8x8_t d01;
+    height = h;
+    width = w;
+
+    if (width <= 4) {
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+      int16x8_t dd0;
+      uint8x8_t d01;
 
 #if defined(__aarch64__)
-    int16x4_t s8, s9, s10, d1, d2, d3;
-    int16x8_t dd1;
-    uint8x8_t d23;
-#endif  // defined(__aarch64__)
+      int16x4_t s8, s9, s10, d1, d2, d3;
+      int16x8_t dd1;
+      uint8x8_t d23;
+#endif
 
-    int16_t *s = src_ptr;
-    uint8_t *d = dst_ptr;
+      d_u8 = dst_u8_ptr;
+      v_s = v_src_ptr;
 
-    load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-    s += (7 * src_stride);
+      __builtin_prefetch(v_s + 0 * im_stride);
+      __builtin_prefetch(v_s + 1 * im_stride);
+      __builtin_prefetch(v_s + 2 * im_stride);
+      __builtin_prefetch(v_s + 3 * im_stride);
+      __builtin_prefetch(v_s + 4 * im_stride);
+      __builtin_prefetch(v_s + 5 * im_stride);
+      __builtin_prefetch(v_s + 6 * im_stride);
+      __builtin_prefetch(v_s + 7 * im_stride);
 
-    do {
-#if defined(__aarch64__)
-      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
-      s += (4 * src_stride);
-
-      d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                  round_shift_vec, offset_const, sub_const_vec);
-      d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                  round_shift_vec, offset_const, sub_const_vec);
-      d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                  round_shift_vec, offset_const, sub_const_vec);
-      d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                  round_shift_vec, offset_const, sub_const_vec);
-
-      dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
-      dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
-
-      d01 = vqmovun_s16(dd0);
-      d23 = vqmovun_s16(dd1);
-
-      vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 1);
-      d += dst_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      h -= 4;
-#else   // !defined(__aarch64__)
-      s7 = vld1_s16(s);
-      s += src_stride;
-
-      d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                  round_shift_vec, offset_const, sub_const_vec);
-
-      dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
-      d01 = vqmovun_s16(dd0);
-
-      vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
-      d += dst_stride;
-
-      s0 = s1;
-      s1 = s2;
-      s2 = s3;
-      s3 = s4;
-      s4 = s5;
-      s5 = s6;
-      s6 = s7;
-      h--;
-#endif  // defined(__aarch64__)
-    } while (h > 0);
-  } else {
-    // if width is a multiple of 8 & height is a multiple of 4
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint8x8_t d0;
-#if defined(__aarch64__)
-    int16x8_t s8, s9, s10;
-    uint8x8_t d1, d2, d3;
-#endif  // defined(__aarch64__)
-
-    do {
-      int height = h;
-      int16_t *s = src_ptr;
-      uint8_t *d = dst_ptr;
-
-      load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-      s += (7 * src_stride);
+      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+      v_s += (7 * im_stride);
 
       do {
 #if defined(__aarch64__)
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-        s += (4 * src_stride);
+        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+        v_s += (im_stride << 2);
 
-        d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec, vec_round_bits);
-        d1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec, vec_round_bits);
-        d2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec, vec_round_bits);
-        d3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec, vec_round_bits);
+        __builtin_prefetch(d_u8 + 0 * dst_stride);
+        __builtin_prefetch(d_u8 + 1 * dst_stride);
+        __builtin_prefetch(d_u8 + 2 * dst_stride);
+        __builtin_prefetch(d_u8 + 3 * dst_stride);
 
-        vst1_u8(d, d0);
-        d += dst_stride;
-        vst1_u8(d, d1);
-        d += dst_stride;
-        vst1_u8(d, d2);
-        d += dst_stride;
-        vst1_u8(d, d3);
-        d += dst_stride;
+        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+
+        dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+        dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
+
+        d01 = vqmovun_s16(dd0);
+        d23 = vqmovun_s16(dd1);
+
+        if ((w == 4) && (h != 2)) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        1);  // 10 11 12 13
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                        0);  // 20 21 22 23
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                        1);  // 30 31 32 33
+          d_u8 += dst_stride;
+        } else if ((w == 2) && (h != 2)) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        2);  // 10 11
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                        0);  // 20 21
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                        2);  // 30 31
+          d_u8 += dst_stride;
+        } else if ((w == 4) && (h == 2)) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        1);  // 10 11 12 13
+          d_u8 += dst_stride;
+        } else if ((w == 2) && (h == 2)) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        2);  // 10 11
+          d_u8 += dst_stride;
+        }
 
         s0 = s4;
         s1 = s5;
@@ -1477,16 +1404,29 @@
         s5 = s9;
         s6 = s10;
         height -= 4;
-#else   // !defined(__aarch64__)
-        s7 = vld1q_s16(s);
-        s += src_stride;
+#else
+        s7 = vld1_s16(v_s);
+        v_s += im_stride;
 
-        d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+        __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                     round_shift_vec, offset_const,
-                                    sub_const_vec, vec_round_bits);
+                                    sub_const_vec);
 
-        vst1_u8(d, d0);
-        d += dst_stride;
+        dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
+        d01 = vqmovun_s16(dd0);
+
+        if (w == 4) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+
+        } else if (w == 2) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+        }
 
         s0 = s1;
         s1 = s2;
@@ -1495,49 +1435,109 @@
         s4 = s5;
         s5 = s6;
         s6 = s7;
-        height--;
-#endif  // defined(__aarch64__)
+        height -= 1;
+#endif
       } while (height > 0);
+    } else {
+      // if width is a multiple of 8 & height is a multiple of 4
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint8x8_t res0;
+#if defined(__aarch64__)
+      int16x8_t s8, s9, s10;
+      uint8x8_t res1, res2, res3;
+#endif
 
-      src_ptr += 8;
-      dst_ptr += 8;
-      w -= 8;
-    } while (w > 0);
-  }
-}
+      do {
+        __builtin_prefetch(v_src_ptr + 0 * im_stride);
+        __builtin_prefetch(v_src_ptr + 1 * im_stride);
+        __builtin_prefetch(v_src_ptr + 2 * im_stride);
+        __builtin_prefetch(v_src_ptr + 3 * im_stride);
+        __builtin_prefetch(v_src_ptr + 4 * im_stride);
+        __builtin_prefetch(v_src_ptr + 5 * im_stride);
+        __builtin_prefetch(v_src_ptr + 6 * im_stride);
+        __builtin_prefetch(v_src_ptr + 7 * im_stride);
 
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int subpel_y_qn,
-                             ConvolveParams *conv_params) {
-  if (filter_params_x->taps > 8) {
-    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                         filter_params_x, filter_params_y, subpel_x_qn,
-                         subpel_y_qn, conv_params);
-  } else {
-    DECLARE_ALIGNED(16, int16_t,
-                    im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+        v_s = v_src_ptr;
+        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+        v_s += (7 * im_stride);
 
-    const int im_h = h + filter_params_y->taps - 1;
-    const int im_stride = MAX_SB_SIZE;
-    const int vert_offset = filter_params_y->taps / 2 - 1;
-    const int horiz_offset = filter_params_x->taps / 2 - 1;
-    const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+        d_u8 = dst_u8_ptr;
+        height = h;
 
-    const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
-    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+        do {
+#if defined(__aarch64__)
+          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+          v_s += (im_stride << 2);
 
-    av1_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w,
-                                  im_h, x_filter, conv_params->round_0);
+          __builtin_prefetch(d_u8 + 4 * dst_stride);
+          __builtin_prefetch(d_u8 + 5 * dst_stride);
+          __builtin_prefetch(d_u8 + 6 * dst_stride);
+          __builtin_prefetch(d_u8 + 7 * dst_stride);
 
-    av1_convolve_2d_sr_vert_neon(im_block, im_stride, dst, dst_stride, w, h,
-                                 y_filter, conv_params);
+          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+
+          if (h != 2) {
+            vst1_u8(d_u8, res0);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res1);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res2);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res3);
+            d_u8 += dst_stride;
+          } else {
+            vst1_u8(d_u8, res0);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res1);
+            d_u8 += dst_stride;
+          }
+          s0 = s4;
+          s1 = s5;
+          s2 = s6;
+          s3 = s7;
+          s4 = s8;
+          s5 = s9;
+          s6 = s10;
+          height -= 4;
+#else
+          s7 = vld1q_s16(v_s);
+          v_s += im_stride;
+
+          __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+
+          vst1_u8(d_u8, res0);
+          d_u8 += dst_stride;
+
+          s0 = s1;
+          s1 = s2;
+          s2 = s3;
+          s3 = s4;
+          s4 = s5;
+          s5 = s6;
+          s6 = s7;
+          height -= 1;
+#endif
+        } while (height > 0);
+        v_src_ptr += 8;
+        dst_u8_ptr += 8;
+        w -= 8;
+      } while (w > 0);
+    }
   }
 }
 
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index e8c2ad6..05d781e 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -230,75 +230,6 @@
   return sum;
 }
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
-  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
-  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
-  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-static INLINE int32x4_t convolve8_4_dot_s16(uint8x16_t samples,
-                                            const int8x8_t filters,
-                                            const int32x4_t correction,
-                                            const uint8x16_t range_limit,
-                                            const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
-}
-
-static INLINE int16x8_t convolve8_8_dot_s16(uint8x16_t samples,
-                                            const int8x8_t filters,
-                                            const int32x4_t correction,
-                                            const uint8x16_t range_limit,
-                                            const uint8x16x3_t permute_tbl,
-                                            const int16x8_t shift_round_0) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  return vqrshlq_s16(sum, shift_round_0);
-}
-
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE int16x4_t convolve8_4x4_s16(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index ddf55bc..6f4f58e 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -281,108 +281,6 @@
   }
 }
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void dist_wtd_convolve_2d_horiz_neon(
-    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
-    const int16x8_t x_filter_s16, const int im_h, int w, const int round_0) {
-  const int bd = 8;
-  int16_t *dst_ptr = im_block;
-  int dst_stride = im_stride;
-  int width = w;
-  int height = im_h;
-
-  const int8x8_t x_filter = vmovn_s16(x_filter_s16);
-  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
-  // Dot product constants.
-  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 7);
-  const int32x4_t correction =
-      vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
-  if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
-    uint8x16_t s0, s1, s2, s3;
-    int32x4_t t0, t1, t2, t3;
-    int16x4_t d0, d1, d2, d3;
-
-    do {
-      s0 = vld1q_u8(src + 0 * src_stride);
-      s1 = vld1q_u8(src + 1 * src_stride);
-      s2 = vld1q_u8(src + 2 * src_stride);
-      s3 = vld1q_u8(src + 3 * src_stride);
-
-      t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
-                               permute_tbl);
-      t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
-                               permute_tbl);
-      t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
-                               permute_tbl);
-      t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
-                               permute_tbl);
-
-      d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
-      d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
-      d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
-      d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
-
-      vst1_s16((dst_ptr + 0 * dst_stride), d0);
-      vst1_s16((dst_ptr + 1 * dst_stride), d1);
-      vst1_s16((dst_ptr + 2 * dst_stride), d2);
-      vst1_s16((dst_ptr + 3 * dst_stride), d3);
-
-      src += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
-    const uint8_t *s;
-    int16_t *d;
-    uint8x16_t s0, s1, s2, s3;
-    int16x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst_ptr;
-
-      do {
-        s0 = vld1q_u8(s + 0 * src_stride);
-        s1 = vld1q_u8(s + 1 * src_stride);
-        s2 = vld1q_u8(s + 2 * src_stride);
-        s3 = vld1q_u8(s + 3 * src_stride);
-
-        d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-        d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
-                                 permute_tbl, shift_round_0);
-
-        vst1q_s16(d + 0 * dst_stride, d0);
-        vst1q_s16(d + 1 * dst_stride, d1);
-        vst1q_s16(d + 2 * dst_stride, d2);
-        vst1q_s16(d + 3 * dst_stride, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src += 4 * src_stride;
-      dst_ptr += 4 * dst_stride;
-      height -= 4;
-    } while (height > 0);
-  }
-}
-
-#else  // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
-
 static INLINE void dist_wtd_convolve_2d_horiz_neon(
     const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
     const int16x8_t x_filter, const int im_h, int w, const int round_0) {
@@ -627,8 +525,6 @@
   }
 }
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE void dist_wtd_convolve_2d_vert_neon(
     int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
     ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {