Use lane-referencing intrinsics in Neon convolution kernels

The Neon convolution functions take a pointer to a filter and load
each of the (usually 8) filter values into a separate Neon register
before executing a series of multiply-accumulate instructions.

This patch modifies these helper functions to load all of the filter
values into a single Neon register and then access them via the lane-
referencing versions of the various multiply and multiply-accumulate
Neon instructions. This reduces both register pressure and the number
of load instructions.
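
A minimal sketch of the before/after pattern, reduced to four taps for
brevity (the helper names here are illustrative only; the real kernels
are the convolve8_* helpers changed below):

  #include <arm_neon.h>

  // Before: each filter tap is read from memory and broadcast into its
  // own register via the _n_ variants.
  static inline int16x4_t convolve4_taps_scalar(int16x4_t s0, int16x4_t s1,
                                                 int16x4_t s2, int16x4_t s3,
                                                 const int16_t *filter) {
    int16x4_t sum = vmul_n_s16(s0, filter[0]);
    sum = vmla_n_s16(sum, s1, filter[1]);
    sum = vmla_n_s16(sum, s2, filter[2]);
    sum = vmla_n_s16(sum, s3, filter[3]);
    return sum;
  }

  // After: the whole filter sits in one register and each tap is selected
  // by lane, avoiding the per-tap loads and duplicates.
  static inline int16x4_t convolve4_taps_lane(int16x4_t s0, int16x4_t s1,
                                               int16x4_t s2, int16x4_t s3,
                                               int16x8_t filter) {
    const int16x4_t filter_lo = vget_low_s16(filter);
    int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
    sum = vmla_lane_s16(sum, s1, filter_lo, 1);
    sum = vmla_lane_s16(sum, s2, filter_lo, 2);
    sum = vmla_lane_s16(sum, s3, filter_lo, 3);
    return sum;
  }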

Change-Id: I2c13251449113b26500517d9c6d774e189cd622e
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index f0e4bed..61c8760 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -27,20 +27,22 @@
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
-                                      const int16_t *filter) {
+                                      const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x4_t sum;
 
-  sum = vmul_n_s16(s0, filter[0]);
-  sum = vmla_n_s16(sum, s1, filter[1]);
-  sum = vmla_n_s16(sum, s2, filter[2]);
-  sum = vmla_n_s16(sum, s5, filter[5]);
-  sum = vmla_n_s16(sum, s6, filter[6]);
-  sum = vmla_n_s16(sum, s7, filter[7]);
+  sum = vmul_lane_s16(s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
   /* filter[3] can take a max value of 128. So the max value of the result :
    * 128*255 + sum > 16 bits
    */
-  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
-  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
 
   return sum;
 }
@@ -48,21 +50,23 @@
 static INLINE uint8x8_t convolve8_horiz_8x8(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
     const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x8_t sum;
 
-  sum = vmulq_n_s16(s0, filter[0]);
-  sum = vmlaq_n_s16(sum, s1, filter[1]);
-  sum = vmlaq_n_s16(sum, s2, filter[2]);
-  sum = vmlaq_n_s16(sum, s5, filter[5]);
-  sum = vmlaq_n_s16(sum, s6, filter[6]);
-  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  sum = vmulq_lane_s16(s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
   /* filter[3] can take a max value of 128. So the max value of the result :
    * 128*255 + sum > 16 bits
    */
-  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
-  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
 
   sum = vqrshlq_s16(sum, shift_round_0);
   sum = vqrshlq_s16(sum, shift_by_bits);
@@ -74,21 +78,23 @@
 static INLINE uint8x8_t convolve8_horiz_4x1(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
     const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x4_t sum;
 
-  sum = vmul_n_s16(s0, filter[0]);
-  sum = vmla_n_s16(sum, s1, filter[1]);
-  sum = vmla_n_s16(sum, s2, filter[2]);
-  sum = vmla_n_s16(sum, s5, filter[5]);
-  sum = vmla_n_s16(sum, s6, filter[6]);
-  sum = vmla_n_s16(sum, s7, filter[7]);
+  sum = vmul_lane_s16(s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
   /* filter[3] can take a max value of 128. So the max value of the result :
    * 128*255 + sum > 16 bits
    */
-  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
-  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
 
   sum = vqrshl_s16(sum, shift_round_0);
   sum = vqrshl_s16(sum, shift_by_bits);
@@ -100,20 +106,22 @@
 static INLINE uint8x8_t convolve8_vert_8x4(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x8_t sum;
 
-  sum = vmulq_n_s16(s0, filter[0]);
-  sum = vmlaq_n_s16(sum, s1, filter[1]);
-  sum = vmlaq_n_s16(sum, s2, filter[2]);
-  sum = vmlaq_n_s16(sum, s5, filter[5]);
-  sum = vmlaq_n_s16(sum, s6, filter[6]);
-  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  sum = vmulq_lane_s16(s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
   /* filter[3] can take a max value of 128. So the max value of the result :
    * 128*255 + sum > 16 bits
    */
-  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
-  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
 
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
@@ -121,21 +129,23 @@
 static INLINE uint16x4_t convolve8_vert_4x4_s32(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
     const int32x4_t round_shift_vec, const int32x4_t offset_const,
     const int32x4_t sub_const_vec) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   int32x4_t sum0;
   uint16x4_t res;
   const int32x4_t zero = vdupq_n_s32(0);
 
-  sum0 = vmull_n_s16(s0, y_filter[0]);
-  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
-  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
-  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
-  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
-  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
-  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
-  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+  sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
 
   sum0 = vaddq_s32(sum0, offset_const);
   sum0 = vqrshlq_s32(sum0, round_shift_vec);
@@ -150,30 +160,32 @@
 static INLINE uint8x8_t convolve8_vert_8x4_s32(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
     const int32x4_t round_shift_vec, const int32x4_t offset_const,
     const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   int32x4_t sum0, sum1;
   uint16x8_t res;
   const int32x4_t zero = vdupq_n_s32(0);
 
-  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
+  sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
 
-  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+  sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
 
   sum0 = vaddq_s32(sum0, offset_const);
   sum1 = vaddq_s32(sum1, offset_const);
@@ -213,8 +225,9 @@
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
 
   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -622,8 +635,9 @@
 
   src -= vert_offset * src_stride;
 
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
 
   if (w <= 4) {
     uint8x8_t d01;
@@ -853,7 +867,7 @@
 // Processes one row at a time
 static INLINE void horiz_filter_w8_single_row(
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
-    const int dst_stride, int width, int height, const int16_t *x_filter,
+    const int dst_stride, int width, int height, const int16x8_t x_filter,
     const int16x8_t horiz_const, const int16x8_t shift_round_0) {
   int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
   do {
@@ -899,7 +913,7 @@
 // Processes one row at a time
 static INLINE void horiz_filter_w4_single_row(
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
-    const int dst_stride, int width, int height, const int16_t *x_filter,
+    const int dst_stride, int width, int height, const int16x8_t x_filter,
     const int16x4_t horiz_const, const int16x4_t shift_round_0) {
   int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
   do {
@@ -983,16 +997,12 @@
       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
-  int16_t x_filter_tmp[8];
-  int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
-  // filter coeffs are even, so downshifting by 1 to reduce intermediate
-  // precision requirements.
-  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
-  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
   assert(conv_params->round_0 > 0);
 
@@ -1035,13 +1045,13 @@
       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                              horiz_const, shift_round_0);
-      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                              horiz_const, shift_round_0);
-      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                              horiz_const, shift_round_0);
-      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                              horiz_const, shift_round_0);
 
       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
@@ -1068,13 +1078,11 @@
     if (height) {
       assert(height < 4);
       horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
-                                 height, x_filter_tmp, horiz_const,
-                                 shift_round_0);
+                                 height, x_filter, horiz_const, shift_round_0);
     }
 #else
     horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
-                               height, x_filter_tmp, horiz_const,
-                               shift_round_0);
+                               height, x_filter, horiz_const, shift_round_0);
 #endif
 
   } else {
@@ -1135,22 +1143,22 @@
         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                  horiz_const, shift_round_0);
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                  horiz_const, shift_round_0);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                  horiz_const, shift_round_0);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                  horiz_const, shift_round_0);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                  horiz_const, shift_round_0);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 x_filter_tmp, horiz_const, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                 horiz_const, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                 horiz_const, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+                                 horiz_const, shift_round_0);
 
         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                           &res7);
@@ -1220,28 +1228,28 @@
         reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
         d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
-                           x_filter_tmp);
+                           x_filter);
 
         d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
-                           x_filter_tmp);
+                           x_filter);
 
         d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
-                           x_filter_tmp);
+                           x_filter);
 
         d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
-                           x_filter_tmp);
+                           x_filter);
 
         d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
-                           x_filter_tmp);
+                           x_filter);
 
         d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
-                           x_filter_tmp);
+                           x_filter);
 
         d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
-                           x_filter_tmp);
+                           x_filter);
 
         d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
-                           x_filter_tmp);
+                           x_filter);
 
         transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
                           &out2, &out3);
@@ -1279,14 +1287,12 @@
     if (height) {
       assert(height < 4);
       horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
-                                 height, x_filter_tmp, horiz_const,
-                                 shift_round_0);
+                                 height, x_filter, horiz_const, shift_round_0);
     }
 #else
 
     horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
-                               height, x_filter_tmp, horiz_const,
-                               shift_round_0);
+                               height, x_filter, horiz_const, shift_round_0);
 #endif
   }
 
@@ -1297,8 +1303,9 @@
 
     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                               (1 << (offset_bits - conv_params->round_1 - 1));
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+    const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
         filter_params_y, subpel_y_qn & SUBPEL_MASK);
+    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
 
     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1574,8 +1581,6 @@
         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
         if (x_q4 & SUBPEL_MASK) {
           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
           uint8x8_t s[8], d;
           int16x8_t ss[4];
           int16x4_t t[8], tt;
@@ -1597,7 +1602,7 @@
           t[7] = vget_high_s16(ss[3]);
 
           tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
-                           filters, filter3, filter4);
+                           filters);
           d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
           vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
         } else {
@@ -1703,8 +1708,6 @@
 
     if (y_q4 & SUBPEL_MASK) {
       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
       uint8x8_t s[8], d;
       int16x4_t t[8], tt;
 
@@ -1719,8 +1722,7 @@
       t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
       t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
 
-      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
-                       filter3, filter4);
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
       d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
     } else {
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 27a996c..3459ebe 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -19,21 +19,19 @@
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
                                     const int16x4_t s6, const int16x4_t s7,
-                                    const int16x8_t filters,
-                                    const int16x4_t filter3,
-                                    const int16x4_t filter4) {
-  const int16x4_t filters_lo = vget_low_s16(filters);
-  const int16x4_t filters_hi = vget_high_s16(filters);
+                                    const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x4_t sum;
 
-  sum = vmul_lane_s16(s0, filters_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
-  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
-  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+  sum = vmul_lane_s16(s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
   return sum;
 }
 
@@ -41,28 +39,24 @@
                                     const int16x8_t s2, const int16x8_t s3,
                                     const int16x8_t s4, const int16x8_t s5,
                                     const int16x8_t s6, const int16x8_t s7,
-                                    const int16x8_t filters,
-                                    const int16x8_t filter3,
-                                    const int16x8_t filter4) {
-  const int16x4_t filters_lo = vget_low_s16(filters);
-  const int16x4_t filters_hi = vget_high_s16(filters);
+                                    const int16x8_t filter) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x8_t sum;
 
-  sum = vmulq_lane_s16(s0, filters_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
-  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
-  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+  sum = vmulq_lane_s16(s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
   return vqrshrun_n_s16(sum, 7);
 }
 
 static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
-                                       const int16x8_t filters) {
-  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+                                       const int16x8_t filter) {
   int16x8_t ss[8];
 
   ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -75,7 +69,7 @@
   ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
 
   return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
-                     filters, filter3, filter4);
+                     filter);
 }
 
 static INLINE uint8x8_t wiener_convolve8_vert_4x8(
@@ -93,20 +87,21 @@
   const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
   const int32x4_t zero = vdupq_n_s32(0);
   const int32x4_t round_vec = vdupq_n_s32(round_const);
+  const int16x4_t filter = vld1_s16(filter_y);
 
   ss0 = vaddq_s16(s0, s6);
   ss1 = vaddq_s16(s1, s5);
   ss2 = vaddq_s16(s2, s4);
 
-  sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]);
-  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]);
+  sum0 = vmull_lane_s16(vget_low_s16(ss0), filter, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss1), filter, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(ss2), filter, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
 
-  sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]);
-  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]);
+  sum1 = vmull_lane_s16(vget_high_s16(ss0), filter, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss1), filter, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(ss2), filter, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
 
   sum0 = vsubq_s32(sum0, round_vec);
   sum1 = vsubq_s32(sum1, round_vec);
@@ -143,10 +138,11 @@
 
   const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
   const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+  const int16x4_t filter = vld1_s16(filter_x);
 
-  sum = vmulq_n_s16(s0, filter_x[0]);
-  sum = vmlaq_n_s16(sum, s1, filter_x[1]);
-  sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+  sum = vmulq_lane_s16(s0, filter, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter, 2);
 
   /* sum from 16x8 to 2 32x4 registers */
   sum_0 = vmovl_s16(vget_low_s16(sum));
@@ -156,8 +152,8 @@
    *  then max value possible = 128*128*255 exceeding 16 bit
    */
 
-  s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
-  s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+  s3_0 = vmull_lane_s16(vget_low_s16(s3), filter, 3);
+  s3_1 = vmull_lane_s16(vget_high_s16(s3), filter, 3);
   sum_0 = vaddq_s32(sum_0, s3_0);
   sum_1 = vaddq_s32(sum_1, s3_1);
 
@@ -192,21 +188,22 @@
   const int32x4_t zero = vdupq_n_s32(0);
   const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
   const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+  const int16x4_t filter = vld1_s16(filter_x);
 
   temp0 = vadd_s16(s0, s6);
   temp1 = vadd_s16(s1, s5);
   temp2 = vadd_s16(s2, s4);
 
-  sum = vmul_n_s16(temp0, filter_x[0]);
-  sum = vmla_n_s16(sum, temp1, filter_x[1]);
-  sum = vmla_n_s16(sum, temp2, filter_x[2]);
+  sum = vmul_lane_s16(temp0, filter, 0);
+  sum = vmla_lane_s16(sum, temp1, filter, 1);
+  sum = vmla_lane_s16(sum, temp2, filter, 2);
   sum_0 = vmovl_s16(sum);
 
   /* s[3]*128 -- and filter coff max can be 128.
    * then max value possible = 128*128*255 Therefore, 32 bits are required to
    * hold the result.
    */
-  s3_0 = vmull_n_s16(s3, filter_x[3]);
+  s3_0 = vmull_lane_s16(s3, filter, 3);
   sum_0 = vaddq_s32(sum_0, s3_0);
 
   sum_0 = vaddq_s32(sum_0, round_vec_0);
@@ -218,44 +215,48 @@
   return res;
 }
 
-static INLINE int16x8_t
-convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
-                  const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
-                  const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
-                  const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+static INLINE int16x8_t convolve8_8x8_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x8_t sum;
-  int16x8_t res;
 
   sum = horiz_const;
-  sum = vmlaq_n_s16(sum, s0, filter[0]);
-  sum = vmlaq_n_s16(sum, s1, filter[1]);
-  sum = vmlaq_n_s16(sum, s2, filter[2]);
-  sum = vmlaq_n_s16(sum, s3, filter[3]);
-  sum = vmlaq_n_s16(sum, s4, filter[4]);
-  sum = vmlaq_n_s16(sum, s5, filter[5]);
-  sum = vmlaq_n_s16(sum, s6, filter[6]);
-  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
 
-  res = vqrshlq_s16(sum, shift_round_0);
+  sum = vqrshlq_s16(sum, shift_round_0);
 
-  return res;
+  return sum;
 }
 
-static INLINE int16x4_t
-convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-                  const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-                  const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
-                  const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+static INLINE int16x4_t convolve8_4x4_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+  const int16x4_t filter_lo = vget_low_s16(filter);
+  const int16x4_t filter_hi = vget_high_s16(filter);
   int16x4_t sum;
+
   sum = horiz_const;
-  sum = vmla_n_s16(sum, s0, filter[0]);
-  sum = vmla_n_s16(sum, s1, filter[1]);
-  sum = vmla_n_s16(sum, s2, filter[2]);
-  sum = vmla_n_s16(sum, s3, filter[3]);
-  sum = vmla_n_s16(sum, s4, filter[4]);
-  sum = vmla_n_s16(sum, s5, filter[5]);
-  sum = vmla_n_s16(sum, s6, filter[6]);
-  sum = vmla_n_s16(sum, s7, filter[7]);
+  sum = vmla_lane_s16(sum, s0, filter_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
 
   sum = vqrshl_s16(sum, shift_round_0);
 
@@ -265,20 +266,22 @@
 static INLINE uint16x4_t convolve8_4x4_s32(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
     const int32x4_t round_shift_vec, const int32x4_t offset_const) {
+  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   int32x4_t sum0;
   uint16x4_t res;
   const int32x4_t zero = vdupq_n_s32(0);
 
-  sum0 = vmull_n_s16(s0, y_filter[0]);
-  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
-  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
-  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
-  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
-  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
-  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
-  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+  sum0 = vmull_lane_s16(s0, y_filter_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, s1, y_filter_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, s2, y_filter_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, s3, y_filter_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, s4, y_filter_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, s5, y_filter_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, s6, y_filter_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, s7, y_filter_hi, 3);
 
   sum0 = vaddq_s32(sum0, offset_const);
   sum0 = vqrshlq_s32(sum0, round_shift_vec);
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index e0b76a8..22caf83 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -317,7 +317,7 @@
 
 static INLINE void dist_wtd_convolve_2d_horiz_neon(
     const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
-    int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+    const int16x8_t x_filter, const int im_h, int w, const int round_0) {
   const int bd = 8;
   const uint8_t *s;
   int16_t *dst_ptr;
@@ -380,13 +380,13 @@
       s9 = vget_low_s16(tt2);
       s10 = vget_low_s16(tt3);
 
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                              horiz_const, shift_round_0);
-      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                              horiz_const, shift_round_0);
-      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                              horiz_const, shift_round_0);
-      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                              horiz_const, shift_round_0);
 
       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
@@ -418,7 +418,7 @@
       s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
       s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
 
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                              horiz_const, shift_round_0);
 
       vst1_s16(dst_ptr, d0);
@@ -483,22 +483,22 @@
         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                  horiz_const, shift_round_0);
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                  horiz_const, shift_round_0);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                  horiz_const, shift_round_0);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                  horiz_const, shift_round_0);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                  horiz_const, shift_round_0);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 x_filter_tmp, horiz_const, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                 horiz_const, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                 horiz_const, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+                                 horiz_const, shift_round_0);
 
         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                           &res7);
@@ -543,8 +543,8 @@
         s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
         s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
 
-        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
-                                 x_filter_tmp, horiz_const, shift_round_0);
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 horiz_const, shift_round_0);
         vst1q_s16(d_tmp, res0);
 
         s += 8;
@@ -561,7 +561,7 @@
 
 static INLINE void dist_wtd_convolve_2d_vert_neon(
     int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
-    ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
   uint8_t *dst_u8_ptr, *d_u8;
   CONV_BUF_TYPE *dst_ptr, *dst;
   int16_t *src_ptr, *s;
@@ -731,21 +731,18 @@
   const int horiz_offset = filter_params_x->taps / 2 - 1;
   const int round_0 = conv_params->round_0 - 1;
   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  int16_t x_filter_tmp[8];
-  int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
-  // filter coeffs are even, so downshifting by 1 to reduce intermediate
-  // precision requirements.
-  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
-  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
 
   dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
-                                  x_filter_tmp, im_h, w, round_0);
+                                  x_filter, im_h, w, round_0);
 
   dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
                                  conv_params, y_filter, h, w);
@@ -892,18 +889,14 @@
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
   // horizontal filter
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - horiz_offset;
 
-  int16_t x_filter_tmp[8];
-  int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
-  // filter coeffs are even, so downshifting by 1 to reduce intermediate
-  // precision requirements.
-  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
-  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
 
   const uint8_t *s;
   uint8_t *d_u8;
@@ -980,20 +973,20 @@
         s9 = vget_high_s16(u0);
         s10 = vget_high_s16(u1);
 
-        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                               zero, shift_round_0);
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+                               shift_round_0);
         d0 = vrshl_s16(d0, horiz_const);
         d0 = vadd_s16(d0, round_offset_vec);
-        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                               zero, shift_round_0);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+                               shift_round_0);
         d1 = vrshl_s16(d1, horiz_const);
         d1 = vadd_s16(d1, round_offset_vec);
-        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                               zero, shift_round_0);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+                               shift_round_0);
         d2 = vrshl_s16(d2, horiz_const);
         d2 = vadd_s16(d2, round_offset_vec);
-        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
-                               zero, shift_round_0);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, zero,
+                               shift_round_0);
         d3 = vrshl_s16(d3, horiz_const);
         d3 = vadd_s16(d3, round_offset_vec);
 
@@ -1073,8 +1066,8 @@
         s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
         s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
 
-        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                               zero, shift_round_0);
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+                               shift_round_0);
         d0 = vrshl_s16(d0, horiz_const);
         d0 = vadd_s16(d0, round_offset_vec);
         s0 = s4;
@@ -1173,38 +1166,38 @@
         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                                 zero, shift_round_0);
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, zero,
+                                 shift_round_0);
 
         res0 = vrshlq_s16(res0, horiz_const);
         res0 = vaddq_s16(res0, round_offset128);
 
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                                 zero, shift_round_0);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, zero,
+                                 shift_round_0);
         res1 = vrshlq_s16(res1, horiz_const);
         res1 = vaddq_s16(res1, round_offset128);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                                 zero, shift_round_0);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, zero,
+                                 shift_round_0);
         res2 = vrshlq_s16(res2, horiz_const);
         res2 = vaddq_s16(res2, round_offset128);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                  zero, shift_round_0);
         res3 = vrshlq_s16(res3, horiz_const);
         res3 = vaddq_s16(res3, round_offset128);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                  zero, shift_round_0);
         res4 = vrshlq_s16(res4, horiz_const);
         res4 = vaddq_s16(res4, round_offset128);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 x_filter_tmp, zero, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                 zero, shift_round_0);
         res5 = vrshlq_s16(res5, horiz_const);
         res5 = vaddq_s16(res5, round_offset128);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 x_filter_tmp, zero, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                 zero, shift_round_0);
         res6 = vrshlq_s16(res6, horiz_const);
         res6 = vaddq_s16(res6, round_offset128);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 x_filter_tmp, zero, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+                                 zero, shift_round_0);
         res7 = vrshlq_s16(res7, horiz_const);
         res7 = vaddq_s16(res7, round_offset128);
 
@@ -1293,8 +1286,8 @@
         s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
         s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
 
-        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
-                                 x_filter_tmp, zero, shift_round_0);
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 zero, shift_round_0);
 
         res0 = vrshlq_s16(res0, horiz_const);
         res0 = vaddq_s16(res0, round_offset128);
@@ -1352,18 +1345,14 @@
   const int shift_value = (conv_params->round_1 - 1 - bits);
 
   // vertical filter
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
   const uint8_t *src_ptr = src - (vert_offset * src_stride);
 
-  int16_t y_filter_tmp[8];
-  int16x8_t filter_y_coef = vld1q_s16(y_filter);
-
-  // filter coeffs are even, so downshifting by 1 to reduce intermediate
-  // precision requirements.
-  filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
-  vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
 
   const uint8_t *s;
   uint8_t *d_u8;
@@ -1441,17 +1430,17 @@
         s9 = vget_low_s16(u1);
         s10 = vget_high_s16(u1);
 
-        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
-                               zero, shift_vec);
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+                               shift_vec);
         d0 = vadd_s16(d0, round_offset64);
-        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
-                               zero, shift_vec);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+                               shift_vec);
         d1 = vadd_s16(d1, round_offset64);
-        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
-                               zero, shift_vec);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+                               shift_vec);
         d2 = vadd_s16(d2, round_offset64);
-        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
-                               zero, shift_vec);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, zero,
+                               shift_vec);
         d3 = vadd_s16(d3, round_offset64);
 
         if (conv_params->do_average) {
@@ -1504,8 +1493,8 @@
         u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
         s7 = vget_low_s16(u0);
 
-        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
-                               zero, shift_vec);
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+                               shift_vec);
 
         d0 = vadd_s16(d0, round_offset64);
 
@@ -1602,29 +1591,29 @@
         __builtin_prefetch(dst_ptr + 2 * dst_stride);
         __builtin_prefetch(dst_ptr + 3 * dst_stride);
 
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
-                                 zero, shift_vec);
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+                                 shift_vec);
         res0 = vaddq_s16(res0, round_offset128);
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
-                                 zero, shift_vec);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+                                 shift_vec);
         res1 = vaddq_s16(res1, round_offset128);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
-                                 zero, shift_vec);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+                                 shift_vec);
         res2 = vaddq_s16(res2, round_offset128);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  zero, shift_vec);
         res3 = vaddq_s16(res3, round_offset128);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
                                  zero, shift_vec);
         res4 = vaddq_s16(res4, round_offset128);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 y_filter_tmp, zero, shift_vec);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
+                                 zero, shift_vec);
         res5 = vaddq_s16(res5, round_offset128);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 y_filter_tmp, zero, shift_vec);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
+                                 zero, shift_vec);
         res6 = vaddq_s16(res6, round_offset128);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 y_filter_tmp, zero, shift_vec);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
+                                 zero, shift_vec);
         res7 = vaddq_s16(res7, round_offset128);
 
         if (conv_params->do_average) {
@@ -1682,8 +1671,8 @@
 
         __builtin_prefetch(dst_ptr);
 
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
-                                 zero, shift_vec);
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+                                 shift_vec);
         res0 = vaddq_s16(res0, round_offset128);
 
         s0 = s1;