Implement sad<w>xh_neon functions using UDOT instruction

Add an alternative implementation of sad128xh_neon, sad64xh_neon, sad32xh_neon and sad16xh_neon using ABD, UDOT instead of ABD, UADALP. UDOT has higher throughput and lower latency than UADALP on modern Arm CPUs, and accumulating into 32-bit elements removes the need for an extended reduction at the end of the loop.

The existing implementation is retained for CPUs that do not implement the Armv8.4-A UDOT instruction. The availability of the UDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD.

Change-Id: Ic0f2f99d76b480da7277c734400168d6ace3d462

commit: 0858e572ba9df28d2644ce65e10a6ec45ca66181 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Tue Jun 07 23:10:48 2022 +0100
committer: James Zern <jzern@google.com> Thu Sep 01 04:51:50 2022 +0000
tree: ba005ea6a4e7af92fcb4641789e28a74a73976a4
parent: 7aaca88828a5df3af40069b9b0a8601b212e516d [diff]
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 6cb7a03..946d8f9 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c

@@ -15,6 +15,92 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/sum_neon.h"
 
+#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+
+static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    i++;
+  } while (i < h);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         int h) {
+  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    i++;
+  } while (i < h / 2);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else
+
 static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          int h) {
@@ -181,6 +267,8 @@
   return horizontal_add_u16x8(sum);
 }
 
+#endif
+
 static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
commit	0858e572ba9df28d2644ce65e10a6ec45ca66181	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Tue Jun 07 23:10:48 2022 +0100
committer	James Zern <jzern@google.com>	Thu Sep 01 04:51:50 2022 +0000
tree	ba005ea6a4e7af92fcb4641789e28a74a73976a4
parent	7aaca88828a5df3af40069b9b0a8601b212e516d [diff]