sum_neon.h: gather horizontal_add_*() functions

This allows for consistent use of the vaddv/vaddlv instructions on aarch64
instead of ad-hoc per-file helpers and #if blocks at each call site.
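
For illustration (a minimal sketch drawn from the sse_neon.c hunk below),
call sites can drop their per-target #if blocks because the sum_neon.h
helpers now select the vaddv/vaddlv form internally on aarch64:

  // Before: each caller carried its own target check.
  #if defined(__aarch64__)
    sse = vaddvq_u32(sum);
  #else
    sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
  #endif

  // After: one call; horizontal_add_s32x4() uses vaddvq_s32() on aarch64.
  sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));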

Bug: b/217282899
Bug: b/231719821
Change-Id: I16de3905c72aa79837fbabc01b1e7ea281792e89
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 42133b8..b295d71 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -17,6 +17,15 @@
 #include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 
+#if !defined(__aarch64__)
+static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                  vreinterpret_u32_u64(vget_high_u64(c)));
+}
+#endif
+
 unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
   const uint8x16_t b = load_unaligned_u8q(a, a_stride);
   const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
@@ -24,14 +33,13 @@
   const uint32_t d = vaddlvq_u16(c);
   return (d + 8) >> 4;
 #else
-  const uint32x2_t d = horizontal_add_u16x8(c);
+  const uint32x2_t d = horizontal_add_u16x8_v(c);
   return vget_lane_u32(vrshr_n_u32(d, 4), 0);
 #endif
 }
 
 unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
   uint16x8_t sum;
-  uint32x2_t d;
   uint8x8_t b = vld1_u8(a);
   a += a_stride;
   uint8x8_t c = vld1_u8(a);
@@ -44,9 +52,13 @@
     sum = vaddw_u8(sum, e);
   }
 
-  d = horizontal_add_u16x8(sum);
-
+#if defined(__aarch64__)
+  const uint32_t d = vaddlvq_u16(sum);
+  return (d + 32) >> 6;
+#else
+  const uint32x2_t d = horizontal_add_u16x8_v(sum);
   return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+#endif
 }
 
 void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
@@ -157,11 +169,7 @@
   } while (length != 0);
 
   // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-#ifdef __aarch64__
-  return vaddvq_s32(accum);
-#else
   return horizontal_add_s32x4(accum);
-#endif  // __aarch64__
 }
 
 int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
@@ -186,13 +194,8 @@
     v_sse = vmlal_s16(v_sse, v_high, v_high);
 #endif
   }
-#if defined(__aarch64__)
-  int mean = vaddvq_s32(v_mean);
-  int sse = (int)vaddvq_s32(v_sse);
-#else
   int mean = horizontal_add_s32x4(v_mean);
   int sse = horizontal_add_s32x4(v_sse);
-#endif
   // (mean * mean): dynamic range 31 bits.
   int var = sse - ((mean * mean) >> (bwl + 2));
   return var;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 22f2e64..b62628e 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -15,19 +15,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
+#include "aom_dsp/arm/sum_neon.h"
 
 // Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
 // vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
@@ -120,10 +108,10 @@
     ref3 += ref_stride;
   }
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+  res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
 
 void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
@@ -164,10 +152,10 @@
     ref3 += ref_stride;
   }
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+  res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
 
 void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
@@ -219,24 +207,10 @@
     ref3 += ref_stride;
   }
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-static INLINE unsigned int horizontal_add_16x4(const uint16x4_t vec_16x4) {
-  const uint32x2_t a = vpaddl_u16(vec_16x4);
-  const uint64x1_t b = vpaddl_u32(a);
-  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-}
-
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
+  res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
 
 static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
@@ -330,10 +304,10 @@
         sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
         sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
 
-        res[0] += horizontal_add_16x4(q0);
-        res[1] += horizontal_add_16x4(q1);
-        res[2] += horizontal_add_16x4(q2);
-        res[3] += horizontal_add_16x4(q3);
+        res[0] += horizontal_add_u16x4(q0);
+        res[1] += horizontal_add_u16x4(q1);
+        res[2] += horizontal_add_u16x4(q2);
+        res[3] += horizontal_add_u16x4(q3);
       }
       break;
     }
@@ -357,10 +331,10 @@
         ref2 += ref_stride;
         ref3 += ref_stride;
 
-        res[0] += horizontal_add_16x4(q0);
-        res[1] += horizontal_add_16x4(q1);
-        res[2] += horizontal_add_16x4(q2);
-        res[3] += horizontal_add_16x4(q3);
+        res[0] += horizontal_add_u16x4(q0);
+        res[1] += horizontal_add_u16x4(q1);
+        res[2] += horizontal_add_u16x4(q2);
+        res[3] += horizontal_add_u16x4(q3);
       }
       break;
     }
@@ -384,10 +358,10 @@
         ref2 += ref_stride;
         ref3 += ref_stride;
 
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
+        res[0] += horizontal_add_u16x8(q0);
+        res[1] += horizontal_add_u16x8(q1);
+        res[2] += horizontal_add_u16x8(q2);
+        res[3] += horizontal_add_u16x8(q3);
       }
       break;
     }
@@ -418,10 +392,10 @@
         ref2 += ref_stride;
         ref3 += ref_stride;
 
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
+        res[0] += horizontal_add_u16x8(q0);
+        res[1] += horizontal_add_u16x8(q1);
+        res[2] += horizontal_add_u16x8(q2);
+        res[3] += horizontal_add_u16x8(q3);
       }
       break;
     }
@@ -466,10 +440,10 @@
         ref2 += ref_stride;
         ref3 += ref_stride;
 
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
+        res[0] += horizontal_add_u16x8(q0);
+        res[1] += horizontal_add_u16x8(q1);
+        res[2] += horizontal_add_u16x8(q2);
+        res[3] += horizontal_add_u16x8(q3);
       }
       break;
     }
@@ -542,10 +516,10 @@
         ref2 += ref_stride;
         ref3 += ref_stride;
 
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
+        res[0] += horizontal_add_u16x8(q0);
+        res[1] += horizontal_add_u16x8(q1);
+        res[2] += horizontal_add_u16x8(q2);
+        res[3] += horizontal_add_u16x8(q3);
       }
     }
   }
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 4f0a199..acd2c54 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -13,6 +13,7 @@
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
 
 unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride) {
@@ -107,26 +108,6 @@
   return vget_lane_u32(d5, 0);
 }
 
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
 unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
@@ -160,7 +141,7 @@
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
   }
-  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+  return horizontal_long_add_u16x8(vec_accum_lo, vec_accum_hi);
 }
 
 unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
@@ -256,7 +237,7 @@
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
   }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+  return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
 unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
@@ -275,7 +256,7 @@
     vec_accum_hi =
         vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
   }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+  return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
 unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
@@ -290,7 +271,7 @@
     ref += ref_stride;
     vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
   }
-  return horizontal_add_16x8(vec_accum);
+  return horizontal_add_u16x8(vec_accum);
 }
 
 static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -343,7 +324,7 @@
     src_ptr += src_stride;
     ref_ptr += ref_stride;
 
-    sum += horizontal_add_16x8(q3);
+    sum += horizontal_add_u16x8(q3);
   }
 
   return sum;
@@ -379,7 +360,7 @@
     src_ptr += src_stride;
     ref_ptr += ref_stride;
 
-    sum += horizontal_add_16x8(q3);
+    sum += horizontal_add_u16x8(q3);
   }
 
   return sum;
@@ -402,7 +383,7 @@
     q2 = vabdq_u8(q0, q1);
     q3 = vpadalq_u8(q3, q2);
 
-    sum += horizontal_add_16x8(q3);
+    sum += horizontal_add_u16x8(q3);
 
     src_ptr += src_stride;
     ref_ptr += ref_stride;
@@ -447,7 +428,7 @@
     ref_ptr += ref_stride;
     q3 = vabal_u8(q3, q0, q1);
   }
-  return horizontal_add_16x8(q3);
+  return horizontal_add_u16x8(q3);
 }
 
 static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -474,7 +455,7 @@
 
     q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
   }
-  return horizontal_add_16x8(q3);
+  return horizontal_add_u16x8(q3);
 }
 
 #define FSADS128_H(h)                                                    \
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index 35b784a..a69dfb5 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -63,11 +63,7 @@
         b += b_stride << 1;
         y += 2;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     case 8:
       do {
@@ -76,11 +72,7 @@
         b += b_stride;
         y += 1;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     case 16:
       do {
@@ -89,11 +81,7 @@
         b += b_stride;
         y += 1;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     case 32:
       do {
@@ -103,11 +91,7 @@
         b += b_stride;
         y += 1;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     case 64:
       do {
@@ -119,11 +103,7 @@
         b += b_stride;
         y += 1;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     case 128:
       do {
@@ -139,11 +119,7 @@
         b += b_stride;
         y += 1;
       } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
     default:
       if (width & 0x07) {
@@ -171,11 +147,7 @@
           y += 1;
         } while (y < height);
       }
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
       sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
       break;
   }
   return sse;
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 809e51c..a118f3c 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -14,24 +14,64 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
+static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+#if defined(__aarch64__)
+  return vaddlvq_s16(a);
+#else
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+  return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_s32(a);
+#else
   const int64x2_t b = vpaddlq_s32(a);
   const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                                vreinterpret_s32_s64(vget_high_s64(b)));
   return vget_lane_s32(c, 0);
+#endif
 }
 
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
+static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+                                                 const uint16x8_t vec_hi) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
 }
 
-static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) {
+static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(a);
+#else
   const uint32x4_t b = vpaddlq_u16(a);
   const uint64x2_t c = vpaddlq_u32(b);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                  vreinterpret_u32_u64(vget_high_u64(c)));
+  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                vreinterpret_u32_u64(vget_high_u64(c)));
+  return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+#if defined(__aarch64__)
+  return vaddlv_u16(a);
+#else
+  const uint32x2_t b = vpaddl_u16(a);
+  const uint64x1_t c = vpaddl_u32(b);
+  return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif
 }
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index e840f13..3378491 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -475,13 +475,8 @@
     b += 4 * b_stride;
   }
 
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
   *sum = horizontal_add_s16x8(sum_s16);
   *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
 }
 
 // Process a block of any size where the width is divisible by 16.
@@ -524,13 +519,8 @@
     b += b_stride;
   }
 
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
   *sum = horizontal_add_s16x8(sum_s16);
   *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
 }
 
 // Process a block of width 8 two rows at a time.
@@ -568,13 +558,8 @@
     i += 2;
   } while (i < h);
 
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
   *sum = horizontal_add_s16x8(sum_s16);
   *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
 }
 
 #define VARIANCE_NXM(n, m, shift)                                           \
@@ -635,11 +620,10 @@
     v_diff = vpadalq_s16(v_diff, sum_s16);
     v_sse = vpadalq_s32(v_sse, sse_s32);
   }
+  int diff = horizontal_add_s32x4(v_diff);
 #if defined(__aarch64__)
-  int diff = vaddvq_s32(v_diff);
   uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
 #else
-  int diff = horizontal_add_s32x4(v_diff);
   uint32_t sq = vget_lane_u32(
       vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
       0);
diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
index ad81f40..3528105 100644
--- a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
+++ b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -24,6 +24,9 @@
 
 // Compute the sum of all pixel differences of this MB.
 static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if defined(__aarch64__)
+  return vaddlvq_s8(v_sum_diff_total);
+#else
   const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
   const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
   const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
@@ -31,6 +34,7 @@
                                 vget_low_s64(fedcba98_76543210));
   const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
   return sum_diff;
+#endif
 }
 
 // Denoise a 16x1 vector.
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index 8b5888f..0a87503 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -15,6 +15,7 @@
 #include <math.h>
 
 #include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_mem/aom_mem.h"
 
 #include "av1/common/quant_common.h"
@@ -206,17 +207,6 @@
   return v_nz_mask;
 }
 
-static INLINE uint32_t sum_abs_coeff(const uint16x8_t a) {
-#if defined(__aarch64__)
-  return vaddvq_u16(a);
-#else
-  const uint32x4_t b = vpaddlq_u16(a);
-  const uint64x2_t c = vpaddlq_u32(b);
-  const uint64x1_t d = vadd_u64(vget_low_u64(c), vget_high_u64(c));
-  return (uint32_t)vget_lane_u64(d, 0);
-#endif
-}
-
 static void quantize_fp_no_qmatrix_neon(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
     const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -246,7 +236,7 @@
     const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
     const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
     // If the coefficient is in the base ZBIN range, then discard.
-    if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+    if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
       non_zero_count -= 16;
     } else {
       break;