Optimize av1_convolve_2d_sr_neon using SDOT instruction

Add an alternative AArch64 implementation of
av1_convolve_2d_sr_horiz_neon - the horizontal portion of
av1_convolve_2d_sr_neon - using the Armv8.4-A SDOT (signed dot-
product) instruction.

The existing MLA-based implementation is retained and used on target CPUs
that do not implement the SDOT instruction (or CPUs executing in AArch32
mode). The availability of the SDOT instruction is indicated by the
feature macro __ARM_FEATURE_DOTPROD.
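
For reference, the 'correction' constant that the new code pre-loads into
the dot-product accumulator compensates for biasing the unsigned samples
into the signed range required by SDOT. A minimal scalar sketch of that
identity (illustrative only, not part of the patch; it omits the halved
filter taps and the horiz_const offset used in the real code):

  #include <stdint.h>

  // Reference: plain unsigned 8-tap convolution.
  static int32_t convolve8_ref(const uint8_t *x, const int16_t *f) {
    int32_t sum = 0;
    for (int i = 0; i < 8; ++i) sum += x[i] * f[i];
    return sum;
  }

  // Same result via biased (signed) samples plus a correction constant,
  // mirroring how the SDOT path seeds its accumulator.
  static int32_t convolve8_biased(const uint8_t *x, const int16_t *f) {
    int32_t filter_sum = 0;
    for (int i = 0; i < 8; ++i) filter_sum += f[i];
    int32_t sum = 128 * filter_sum;  // the 'correction' term
    for (int i = 0; i < 8; ++i) sum += (int8_t)(x[i] - 128) * f[i];
    return sum;
  }

  // convolve8_ref(x, f) == convolve8_biased(x, f) for all inputs.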

Change-Id: Ib22663122bc19f000300f088ae5691f9bcf977d1
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 1a93f35..40be27d 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -15,29 +15,63 @@
 #include <string.h>
 #include "aom_dsp/aom_dsp_common.h"
 
-// Support for these xN intrinsics is lacking in older compilers.
-#if (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) || \
-    (defined(__GNUC__) &&                                               \
-     ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||    \
-      (defined(__clang__) && defined(__arm__) &&                        \
-       (__clang_major__ <= 6 ||                                         \
-        (defined(__ANDROID__) && __clang_major__ <= 7)))))
-static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
-static INLINE uint16x8x4_t vld1q_u16_x4(uint16_t const *ptr) {
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
 }
-#endif  // (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) ||
-        // (defined(__GNUC__) &&
-        //  ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||
-        //   (defined(__clang__) && defined(__arm__) &&
-        //    (__clang_major__ <= 6 ||
-        //     (defined(__ANDROID__) && __clang_major__ <= 7)))))
+
+#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
+#if __GNUC__ < 8
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+  return res;
+}
+#endif  // __GNUC__ < 8
+
+#if __GNUC__ < 9
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif  // __GNUC__ < 9
+#endif  // defined(__GNUC__) && !defined(__clang__)
 
 static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                      const uint8x8_t s1) {
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 7387a13..b12c279 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -844,6 +844,170 @@
   }
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void av1_convolve_2d_sr_horiz_neon(
+    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+    int im_h, const int16x8_t x_filter_s16, const int round_0) {
+  const int bd = 8;
+
+  const uint8_t *src_ptr = src;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+
+  int height = im_h;
+
+  // Filter values are even, so downshift by 1 to reduce intermediate precision
+  // requirements.
+  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+  // Dot product constants.
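+  // Seeding the accumulator with 'correction' undoes the +128 bias that the
+  // convolve helpers subtract from each sample: 128 * sum(halved filter
+  // taps) = 64 * sum(x_filter_s16); horiz_const is folded in as well.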
+  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
+  const int32x4_t correction =
+      vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  assert(round_0 > 0);
+
+  if (w <= 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+    uint8x16_t s0, s1, s2, s3;
+    int32x4_t t0, t1, t2, t3;
+    int16x4_t d0, d1, d2, d3;
+
+    do {
+      assert(height >= 4);
+
+      load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
+                               permute_tbl);
+      t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
+                               permute_tbl);
+      t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
+                               permute_tbl);
+      t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
+                               permute_tbl);
+
+      d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+      d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+      d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+      d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+      if (w == 2) {
+        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * dst_stride),
+                      vreinterpret_u32_s16(d0), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * dst_stride),
+                      vreinterpret_u32_s16(d1), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * dst_stride),
+                      vreinterpret_u32_s16(d2), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * dst_stride),
+                      vreinterpret_u32_s16(d3), 0);
+      } else {
+        vst1_s16(dst_ptr + 0 * dst_stride, d0);
+        vst1_s16(dst_ptr + 1 * dst_stride, d1);
+        vst1_s16(dst_ptr + 2 * dst_stride, d2);
+        vst1_s16(dst_ptr + 3 * dst_stride, d3);
+      }
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height >= 4);
+
+    if (height) {
+      assert(height < 4);
+
+      do {
+        s0 = vld1q_u8(src_ptr);
+        t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
+                                 permute_tbl);
+        d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+
+        if (w == 2) {
+          vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+        } else {
+          vst1_s16(dst_ptr, d0);
+        }
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+        height--;
+      } while (height > 0);
+    }
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+    uint8x16_t s0, s1, s2, s3;
+    int16x8_t d0, d1, d2, d3;
+
+    do {
+      assert(height >= 4);
+
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+
+        vst1q_s16(d + 0 * dst_stride, d0);
+        vst1q_s16(d + 1 * dst_stride, d1);
+        vst1q_s16(d + 2 * dst_stride, d2);
+        vst1q_s16(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height >= 4);
+
+    if (height) {
+      assert(height < 4);
+
+      do {
+        const uint8_t *s = src_ptr;
+        int16_t *d = dst_ptr;
+        int width = w;
+
+        do {
+          s0 = vld1q_u8(s);
+          d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
+                                   permute_tbl, shift_round_0);
+          vst1q_s16(d, d0);
+
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width > 0);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+        height--;
+      } while (height > 0);
+    }
+  }
+}
+
+#else  // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
 // Horizontal filtering for convolve_2d_sr for width multiple of 8
 // Processes one row at a time
 static INLINE void horiz_filter_w8_single_row(
@@ -1197,6 +1361,8 @@
   }
 }
 
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE void av1_convolve_2d_sr_vert_neon(
     int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
     int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 05d781e..e8c2ad6 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -230,6 +230,75 @@
   return sum;
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
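+// Shuffle indices producing, for four adjacent output pixels, the four
+// consecutive source samples that a single vdotq_lane_s32 call multiplies
+// against one group of four filter taps.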
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int32x4_t convolve8_4_dot_s16(uint8x16_t samples,
+                                            const int8x8_t filters,
+                                            const int32x4_t correction,
+                                            const uint8x16_t range_limit,
+                                            const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  /* Narrowing and packing is performed by the caller. */
+  return sum;
+}
+
+static INLINE int16x8_t convolve8_8_dot_s16(uint8x16_t samples,
+                                            const int8x8_t filters,
+                                            const int32x4_t correction,
+                                            const uint8x16_t range_limit,
+                                            const uint8x16x3_t permute_tbl,
+                                            const int16x8_t shift_round_0) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  return vqrshlq_s16(sum, shift_round_0);
+}
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE int16x4_t convolve8_4x4_s16(
     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,