Optimize av1_dist_wtd_convolve_2d_neon using SDOT instruction

Add an alternative AArch64 implementation of the
av1_dist_wtd_convolve_2d_horiz_neon helper function for targets that
implement the Armv8.4-A SDOT (signed dot-product) instruction.

The existing MLA-based implementation of
av1_dist_wtd_convolve_2d_horiz_neon is retained for use on target
CPUs that do not implement the SDOT instruction (or CPUs executing in
AArch32 mode.) The availability of the SDOT instruction is indicated
by the feature macro __ARM_FEATURE_DOTPROD.

Change-Id: Ia9175ddcf12516692d757f273e6ab525fe9fad30
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 6f4f58e..ddf55bc 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -281,6 +281,108 @@
   }
 }
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    const int16x8_t x_filter_s16, const int im_h, int w, const int round_0) {
+  const int bd = 8;
+  int16_t *dst_ptr = im_block;
+  int dst_stride = im_stride;
+  int width = w;
+  int height = im_h;
+
+  const int8x8_t x_filter = vmovn_s16(x_filter_s16);
+  const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+  // Dot product constants.
+  const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 7);
+  const int32x4_t correction =
+      vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+    uint8x16_t s0, s1, s2, s3;
+    int32x4_t t0, t1, t2, t3;
+    int16x4_t d0, d1, d2, d3;
+
+    do {
+      s0 = vld1q_u8(src + 0 * src_stride);
+      s1 = vld1q_u8(src + 1 * src_stride);
+      s2 = vld1q_u8(src + 2 * src_stride);
+      s3 = vld1q_u8(src + 3 * src_stride);
+
+      t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
+                               permute_tbl);
+      t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
+                               permute_tbl);
+      t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
+                               permute_tbl);
+      t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
+                               permute_tbl);
+
+      d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
+      d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
+      d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
+      d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
+
+      vst1_s16((dst_ptr + 0 * dst_stride), d0);
+      vst1_s16((dst_ptr + 1 * dst_stride), d1);
+      vst1_s16((dst_ptr + 2 * dst_stride), d2);
+      vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+      src += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+    const uint8_t *s;
+    int16_t *d;
+    uint8x16_t s0, s1, s2, s3;
+    int16x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst_ptr;
+
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+        d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
+                                 permute_tbl, shift_round_0);
+
+        vst1q_s16(d + 0 * dst_stride, d0);
+        vst1q_s16(d + 1 * dst_stride, d1);
+        vst1q_s16(d + 2 * dst_stride, d2);
+        vst1q_s16(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
+#else  // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
+
 static INLINE void dist_wtd_convolve_2d_horiz_neon(
     const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
     const int16x8_t x_filter, const int im_h, int w, const int round_0) {
@@ -525,6 +627,8 @@
   }
 }
 
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE void dist_wtd_convolve_2d_vert_neon(
     int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
     ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {