Optimize 4D Neon reduction for 4xh and 8xh SAD4D blocks

Add a 4D reduction function operating on uint16x8_t vectors and use
it to optimize the final reduction in standard bitdepth 4xh and 8xh
SAD4D computations. Similar 4D reduction optimizations have already
been implemented for all other standard bitdepth block sizes.[1]

[1] https://aomedia-review.googlesource.com/c/aom/+/170281

Change-Id: I33ff7b8c6a232e748972937da3c15ea25f255a1e
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index de24a86..81ec908 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -407,10 +407,7 @@
     i++;
   } while (i < h);
 
-  res[0] = horizontal_add_u16x8(sum[0]);
-  res[1] = horizontal_add_u16x8(sum[1]);
-  res[2] = horizontal_add_u16x8(sum[2]);
-  res[3] = horizontal_add_u16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
 static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
@@ -435,10 +432,7 @@
     i += 2;
   } while (i < h);
 
-  res[0] = horizontal_add_u16x8(sum[0]);
-  res[1] = horizontal_add_u16x8(sum[1]);
-  res[2] = horizontal_add_u16x8(sum[2]);
-  res[3] = horizontal_add_u16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
 #define SAD_WXH_4D_NEON(w, h)                                                  \
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index e4f724c..c0bfc69 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -109,6 +109,23 @@
 #endif
 }
 
+static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  return vpaddlq_u16(b0);
+#else
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);
+  const uint16x4_t b1 = vpadd_u16(a2, a3);
+  return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
 static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
 #if defined(__aarch64__)
   return vaddv_u32(a);