Optimize av1_convolve_2d_sr_neon using SDOT instruction

Add an alternative AArch64 implementation of
av1_convolve_2d_sr_horiz_neon - the horizontal portion of
av1_convolve_2d_sr_neon - using the Armv8.4-A SDOT (signed dot-
product) instruction.

The existing MLA-based implementation is retained and used on target CPUs
that do not implement the SDOT instruction (or CPUs executing in AArch32
mode). The availability of the SDOT instruction is indicated by the
feature macro __ARM_FEATURE_DOTPROD.
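
For reference, the 'correction' constant that the new code pre-loads into
the dot-product accumulator compensates for biasing the unsigned samples
into the signed range required by SDOT. A minimal scalar sketch of that
identity (illustrative only, not part of the patch; it omits the halved
filter taps and the horiz_const offset used in the real code):

  #include <stdint.h>

  // Reference: plain unsigned 8-tap convolution.
  static int32_t convolve8_ref(const uint8_t *x, const int16_t *f) {
    int32_t sum = 0;
    for (int i = 0; i < 8; ++i) sum += x[i] * f[i];
    return sum;
  }

  // Same result via biased (signed) samples plus a correction constant,
  // mirroring how the SDOT path seeds its accumulator.
  static int32_t convolve8_biased(const uint8_t *x, const int16_t *f) {
    int32_t filter_sum = 0;
    for (int i = 0; i < 8; ++i) filter_sum += f[i];
    int32_t sum = 128 * filter_sum;  // the 'correction' term
    for (int i = 0; i < 8; ++i) sum += (int8_t)(x[i] - 128) * f[i];
    return sum;
  }

  // convolve8_ref(x, f) == convolve8_biased(x, f) for all inputs.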

Change-Id: Ib22663122bc19f000300f088ae5691f9bcf977d1
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 1a93f35..40be27d 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -15,29 +15,63 @@
 #include <string.h>
 #include "aom_dsp/aom_dsp_common.h"
 
-// Support for these xN intrinsics is lacking in older compilers.
-#if (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) || \
-    (defined(__GNUC__) &&                                               \
-     ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||    \
-      (defined(__clang__) && defined(__arm__) &&                        \
-       (__clang_major__ <= 6 ||                                         \
-        (defined(__ANDROID__) && __clang_major__ <= 7)))))
-static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
-static INLINE uint16x8x4_t vld1q_u16_x4(uint16_t const *ptr) {
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
 }
-#endif  // (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) ||
-        // (defined(__GNUC__) &&
-        //  ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||
-        //   (defined(__clang__) && defined(__arm__) &&
-        //    (__clang_major__ <= 6 ||
-        //     (defined(__ANDROID__) && __clang_major__ <= 7)))))
+
+#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
+#if __GNUC__ < 8
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+  return res;
+}
+#endif  // __GNUC__ < 8
+
+#if __GNUC__ < 9
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif  // __GNUC__ < 9
+#endif  // defined(__GNUC__) && !defined(__clang__)
 
 static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                      const uint8x8_t s1) {
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 7387a13..b12c279 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -844,6 +844,170 @@
   }
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void av1_convolve_2d_sr_horiz_neon(
+    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+    int im_h, const int16x8_t x_filter_s16, const int round_0) {
+  const int bd = 8;
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+
+  int height = im_h;
+
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+  // Dot product constants.
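+  // Seeding the accumulator with 'correction' undoes the +128 bias that the
+  // convolve helpers subtract from each sample: 128 * sum(halved filter
+  // taps) = 64 * sum(x_filter_s16); horiz_const is folded in as well.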
+  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
+  const int32x4_t correction =
+      vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  assert(round_0 > 0);
+
+  if (w <= 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+    uint8x16_t s0, s1, s2, s3;
+    int32x4_t t0, t1, t2, t3;
+    int16x4_t d0, d1, d2, d3;
+
+    do {
+      assert(height >= 4);
+
+      load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
+                               permute_tbl);
+      t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
+                               permute_tbl);
+      t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
+                               permute_tbl);
+      t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
+                               permute_tbl);
+
+      d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+      d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+      d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+      d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+      if (w == 2) {
+        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * dst_stride),
+                      vreinterpret_u32_s16(d0), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * dst_stride),
+                      vreinterpret_u32_s16(d1), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * dst_stride),
+                      vreinterpret_u32_s16(d2), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * dst_stride),
+                      vreinterpret_u32_s16(d3), 0);
+      } else {
+        vst1_s16(dst_ptr + 0 * dst_stride, d0);
+        vst1_s16(dst_ptr + 1 * dst_stride, d1);
+        vst1_s16(dst_ptr + 2 * dst_stride, d2);
+        vst1_s16(dst_ptr + 3 * dst_stride, d3);
+      }
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height >= 4);
+
+    if (height) {
+      assert(height < 4);
+
+      do {
+        s0 = vld1q_u8(src_ptr);
+        t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
+                                 permute_tbl);
+        d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+
+        if (w == 2) {
+          vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+        } else {
+          vst1_s16(dst_ptr, d0);
+        }
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+        height--;
+      } while (height > 0);
+    }
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+    uint8x16_t s0, s1, s2, s3;
+    int16x8_t d0, d1, d2, d3;
+
+    do {
+      assert(height >= 4);
+
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+
+        vst1q_s16(d + 0 * dst_stride, d0);
+        vst1q_s16(d + 1 * dst_stride, d1);
+        vst1q_s16(d + 2 * dst_stride, d2);
+        vst1q_s16(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height >= 4);
+
+    if (height) {
+      assert(height < 4);
+
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          s0 = vld1q_u8(s);
+          d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
+                                   permute_tbl, shift_round_0);
+          vst1q_s16(d, d0);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width > 0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+        height--;
+      } while (height > 0);
+    }
+  }
+}
+
+#else  // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
 // Horizontal filtering for convolve_2d_sr for width multiple of 8
 // Processes one row at a time
 static INLINE void horiz_filter_w8_single_row(
@@ -1197,6 +1361,8 @@
   }
 }
 
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE void av1_convolve_2d_sr_vert_neon(
     int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
     int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 05d781e..e8c2ad6 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -230,6 +230,75 @@
   return sum;
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
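+// Shuffle indices producing, for four adjacent output pixels, the four
+// consecutive source samples that a single vdotq_lane_s32 call multiplies
+// against one group of four filter taps.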
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int32x4_t convolve8_4_dot_s16(uint8x16_t samples,
+                                            const int8x8_t filters,
+                                            const int32x4_t correction,
+                                            const uint8x16_t range_limit,
+                                            const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  /* Narrowing and packing is performed by the caller. */
+  return sum;
+}
+
+static INLINE int16x8_t convolve8_8_dot_s16(uint8x16_t samples,
+                                            const int8x8_t filters,
+                                            const int32x4_t correction,
+                                            const uint8x16_t range_limit,
+                                            const uint8x16x3_t permute_tbl,
+                                            const int16x8_t shift_round_0) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  return vqrshlq_s16(sum, shift_round_0);
+}
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE int16x4_t convolve8_4x4_s16(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,