Implement speciality variance functions using UDOT instruction

Accelerate the following Neon speciality variance (helper) functions
by calculating the sum of squares using a single Armv8.4-A UDOT
instruction instead of four MLAs:
 * mse8xh_neon
 * mse16xh_neon
 * aom_get4x4sse_cs_neon

The previous implementation is retained for use on systems that do
not support the Armv8.4-A dot-product instructions.

Change-Id: I20b083dc36d7f7bb5550ad82b384d7b96667e5ea
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 17d815c..f078705 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -401,6 +401,72 @@
   }
 }
 
+#if defined(__ARM_FEATURE_DOTPROD)
+
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       unsigned int *sse, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = 0;
+  do {
+    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i += 2;
+  } while (i < h);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        unsigned int *sse, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint8x16_t s0 = vld1q_u8(src);
+    uint8x16_t s1 = vld1q_u8(src + src_stride);
+    uint8x16_t r0 = vld1q_u8(ref);
+    uint8x16_t r1 = vld1q_u8(ref + ref_stride);
+
+    uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+    uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+    sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+    sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i += 2;
+  } while (i < h);
+
+  *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+unsigned int aom_get4x4sse_cs_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *ref, int ref_stride) {
+  uint8x16_t s = load_unaligned_u8q(src, src_stride);
+  uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
+  uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+  return horizontal_add_u32x4(sse);
+}
+
+#else  // !defined(__ARM_FEATURE_DOTPROD)
+
 static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse, int h) {
@@ -498,21 +564,6 @@
   return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
 }
 
-#define MSE_WXH_NEON(w, h)                                                 \
-  unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse) {                \
-    return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h);      \
-  }
-
-MSE_WXH_NEON(8, 8)
-MSE_WXH_NEON(8, 16)
-
-MSE_WXH_NEON(16, 8)
-MSE_WXH_NEON(16, 16)
-
-#undef MSE_WXH_NEON
-
 unsigned int aom_get4x4sse_cs_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
   uint8x8_t s[4], r[4];
@@ -546,3 +597,20 @@
 
   return horizontal_add_u32x4(vreinterpretq_u32_s32(sse));
 }
+
+#endif  // defined(__ARM_FEATURE_DOTPROD)
+
+#define MSE_WXH_NEON(w, h)                                                 \
+  unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse) {                \
+    return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h);      \
+  }
+
+MSE_WXH_NEON(8, 8)
+MSE_WXH_NEON(8, 16)
+
+MSE_WXH_NEON(16, 8)
+MSE_WXH_NEON(16, 16)
+
+#undef MSE_WXH_NEON