Optimize 4D Neon reduction for 4xh and 8xh SAD4D blocks

Add a 4D reduction function operating on uint16x8_t vectors and use it to optimize the final reduction in standard bitdepth 4xh and 8xh SAD4D computations. Similar 4D reduction optimizations have already been implemented for all other standard bitdepth block sizes.[1]

[1] https://aomedia-review.googlesource.com/c/aom/+/170281

Change-Id: I33ff7b8c6a232e748972937da3c15ea25f255a1e

commit: 3e1b9c5fb403dc9cd20c544a1bdfe084ad87f988 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Thu Apr 06 00:38:48 2023 +0100
committer: James Zern <jzern@google.com> Thu Apr 06 18:23:08 2023 +0000
tree: c6a42fe28f52e26e13f33f5a1321adb562d8e49b
parent: 341adedfb04c3e3f8544c1fb6f36a027c2fd50e6 [diff]
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index de24a86..81ec908 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c

@@ -407,10 +407,7 @@
     i++;
   } while (i < h);
 
-  res[0] = horizontal_add_u16x8(sum[0]);
-  res[1] = horizontal_add_u16x8(sum[1]);
-  res[2] = horizontal_add_u16x8(sum[2]);
-  res[3] = horizontal_add_u16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
 static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
@@ -435,10 +432,7 @@
     i += 2;
   } while (i < h);
 
-  res[0] = horizontal_add_u16x8(sum[0]);
-  res[1] = horizontal_add_u16x8(sum[1]);
-  res[2] = horizontal_add_u16x8(sum[2]);
-  res[3] = horizontal_add_u16x8(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
 
 #define SAD_WXH_4D_NEON(w, h)                                                  \

diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index e4f724c..c0bfc69 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h

@@ -109,6 +109,23 @@
 #endif
 }
 
+static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+#if defined(__aarch64__)  // AArch64: full-width pairwise adds
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);  // pairs: sum[0], sum[1]
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);  // pairs: sum[2], sum[3]
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);  // 16-bit adds: wrap past 65535
+  return vpaddlq_u16(b0);  // widen to 32 bits: lane i = total of sum[i]
+#else  // non-AArch64: fold halves with vadd, then 64-bit vpadd
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);  // partial sums: sum[0], sum[1]
+  const uint16x4_t b1 = vpadd_u16(a2, a3);  // partial sums: sum[2], sum[3]
+  return vpaddlq_u16(vcombine_u16(b0, b1));  // lane i = total of sum[i]
+#endif
+}
+
 static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
 #if defined(__aarch64__)
   return vaddv_u32(a);
commit	3e1b9c5fb403dc9cd20c544a1bdfe084ad87f988	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Thu Apr 06 00:38:48 2023 +0100
committer	James Zern <jzern@google.com>	Thu Apr 06 18:23:08 2023 +0000
tree	c6a42fe28f52e26e13f33f5a1321adb562d8e49b
parent	341adedfb04c3e3f8544c1fb6f36a027c2fd50e6 [diff]