Import SIMD intrinsics from more recent Thor code

This enables ARMv8/aarch64 optimisations of CDEF as well as a few
minor improvements to x86 and ARMv7.  Several new intrinsics have also
been added, which makes it possible to remove x86-specific code from
the CDEF implementation.  In addition, various sanitizer warnings have
been addressed (mostly related to intentional two's-complement
overflow/underflow), and there are several AVX2 improvements.
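
For reference, the usual way to keep intentional wraparound
sanitizer-clean is to do the arithmetic on unsigned types and narrow
afterwards; a minimal sketch of that technique (not the exact code in
this patch):

    #include <stdint.h>

    /* Sketch: signed 16-bit add with intentional wraparound.  Doing the
       addition on uint32_t sidesteps the signed-overflow UB flagged by
       -fsanitize=undefined; narrowing back to int16_t is
       implementation-defined rather than undefined, so UBSan stays quiet. */
    static inline int16_t add16_wrap(int16_t a, int16_t b) {
      return (int16_t)((uint32_t)a + (uint32_t)b);
    }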

New intrinsics: v64_sadd_s8, v64_sadd_u8, v64_pack_s32_u16,
v64_rdavg_u16, v128_sad_u16, v128_ssd_s16, v128_sadd_s8, v128_sadd_u8,
v128_add_64, v128_sub_64, v128_pack_s32_u16, v128_rdavg_u16,
v128_min_s32, v128_max_s32, v128_cmpgt_s32, v128_cmpeq_32,
v128_cmplt_s32, v128_padd_u8, v128_shl_n_64, v128_shr_n_u64,
v128_shr_n_s64, v128_shr_s64, v128_shr_u64, v128_shl_64,
v128_dotp_su8, v128_dotp_s32, v128_movemask_8, v128_dup_64,
v128_blend_8, v256_sad_u16, v256_ssd_s16, v256_low_u64, v256_dotp_su8,
v256_dotp_s32, v256_sadd_s8, v256_sadd_u8, v256_add_64, v256_sub_64,
v256_pack_s32_u16, v256_rdavg_u16, v256_min_s32, v256_max_s32,
v256_cmpgt_s32, v256_cmplt_s32, v256_cmpeq_32, v256_wideshuffle_8,
v256_padd_u8, v256_shl_n_64, v256_shr_n_u64, v256_shr_n_s64,
v256_shr_s64, v256_shr_u64, v256_shl_64, v256_movemask_8, v256_dup_64,
v256_blend_8, v256_unziplo_64, v256_unziphi_64
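
The names follow the existing convention (vector width, operation,
lane type).  As an illustration, the presumed per-lane behaviour of
two of the additions is sketched below as a scalar model; this is an
assumption based on that convention, not the shipped C reference in
aom_dsp/simd:

    #include <stdint.h>

    /* Scalar model (assumed semantics): *_sadd_s8 adds a pair of lanes
       with signed saturation, *_cmpgt_s32 yields an all-ones/all-zeros
       lane mask. */
    static inline int8_t lane_sadd_s8(int8_t a, int8_t b) {
      const int32_t s = (int32_t)a + (int32_t)b;
      return (int8_t)(s > 127 ? 127 : s < -128 ? -128 : s);
    }

    static inline int32_t lane_cmpgt_s32(int32_t a, int32_t b) {
      return a > b ? -1 : 0; /* -1 == all bits set */
    }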

The unit tests have been updated.
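
The new three-argument comparison path is driven the same way as the
one- and two-argument ones; a hypothetical call matching the
TestSimd3Args signature added below (the real test lists live in the
SIMD test driver, and the iteration count here is arbitrary):

    // Hypothetical invocation; the real driver registers intrinsics
    // through its own test macros.  Arguments are (iterations, mask,
    // maskwidth, name); a maskwidth of 0 leaves the third operand
    // unrestricted.
    TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(65536, 0u, 0u,
                                                  "v128_blend_8");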

Change-Id: If051e902f2095e3a02aaf13cf1230475392f051e
diff --git a/test/simd_cmp_impl.h b/test/simd_cmp_impl.h
index 6c79cbc..386efba 100644
--- a/test/simd_cmp_impl.h
+++ b/test/simd_cmp_impl.h
@@ -181,6 +181,18 @@
   return v128_shr_n_s32(a, shift);
 }
 template <int shift>
+v128 imm_v128_shl_n_64(v128 a) {
+  return v128_shl_n_64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u64(v128 a) {
+  return v128_shr_n_u64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s64(v128 a) {
+  return v128_shr_n_s64(a, shift);
+}
+template <int shift>
 v128 imm_v128_align(v128 a, v128 b) {
   return v128_align(a, b, shift);
 }
@@ -230,6 +242,18 @@
   return c_v128_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v128 c_imm_v128_shl_n_64(c_v128 a) {
+  return c_v128_shl_n_64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
+  return c_v128_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
+  return c_v128_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
   return c_v128_align(a, b, shift);
 }
@@ -279,6 +303,18 @@
   return v256_shr_n_s32(a, shift);
 }
 template <int shift>
+v256 imm_v256_shl_n_64(v256 a) {
+  return v256_shl_n_64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u64(v256 a) {
+  return v256_shr_n_u64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s64(v256 a) {
+  return v256_shr_n_s64(a, shift);
+}
+template <int shift>
 v256 imm_v256_align(v256 a, v256 b) {
   return v256_align(a, b, shift);
 }
@@ -328,6 +364,18 @@
   return c_v256_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v256 c_imm_v256_shl_n_64(c_v256 a) {
+  return c_v256_shl_n_64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
+  return c_v256_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
+  return c_v256_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
   return c_v256_align(a, b, shift);
 }
@@ -358,6 +406,18 @@
 uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
   return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
 }
+uint32_t v128_sad_u16(v128 a, v128 b) {
+  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
+}
+uint64_t v128_ssd_s16(v128 a, v128 b) {
+  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
+}
+uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
+  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
+}
+uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
+  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
+}
 uint32_t v256_sad_u8(v256 a, v256 b) {
   return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
 }
@@ -370,6 +430,18 @@
 uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
   return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
 }
+uint32_t v256_sad_u16(v256 a, v256 b) {
+  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
+}
+uint64_t v256_ssd_s16(v256 a, v256 b) {
+  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
+}
+uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
+  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
+}
+uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
+  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
+}
 
 namespace {
 
@@ -391,6 +463,8 @@
                       MAP(v64_ssd_u8),
                       MAP(v64_add_8),
                       MAP(v64_add_16),
+                      MAP(v64_sadd_s8),
+                      MAP(v64_sadd_u8),
                       MAP(v64_sadd_s16),
                       MAP(v64_add_32),
                       MAP(v64_sub_8),
@@ -406,6 +480,7 @@
                       MAP(v64_ziphi_16),
                       MAP(v64_ziplo_32),
                       MAP(v64_ziphi_32),
+                      MAP(v64_pack_s32_u16),
                       MAP(v64_pack_s32_s16),
                       MAP(v64_pack_s16_u8),
                       MAP(v64_pack_s16_s8),
@@ -424,6 +499,7 @@
                       MAP(v64_madd_us8),
                       MAP(v64_avg_u8),
                       MAP(v64_rdavg_u8),
+                      MAP(v64_rdavg_u16),
                       MAP(v64_avg_u16),
                       MAP(v64_min_u8),
                       MAP(v64_max_u8),
@@ -564,10 +640,15 @@
                       MAP(v64_from_16),
                       MAP(v128_sad_u8),
                       MAP(v128_ssd_u8),
+                      MAP(v128_sad_u16),
+                      MAP(v128_ssd_s16),
                       MAP(v128_add_8),
                       MAP(v128_add_16),
+                      MAP(v128_sadd_s8),
+                      MAP(v128_sadd_u8),
                       MAP(v128_sadd_s16),
                       MAP(v128_add_32),
+                      MAP(v128_add_64),
                       MAP(v128_sub_8),
                       MAP(v128_ssub_u8),
                       MAP(v128_ssub_s8),
@@ -575,6 +656,7 @@
                       MAP(v128_ssub_s16),
                       MAP(v128_ssub_u16),
                       MAP(v128_sub_32),
+                      MAP(v128_sub_64),
                       MAP(v128_ziplo_8),
                       MAP(v128_ziphi_8),
                       MAP(v128_ziplo_16),
@@ -589,6 +671,7 @@
                       MAP(v128_unziplo_16),
                       MAP(v128_unziphi_32),
                       MAP(v128_unziplo_32),
+                      MAP(v128_pack_s32_u16),
                       MAP(v128_pack_s32_s16),
                       MAP(v128_pack_s16_u8),
                       MAP(v128_pack_s16_s8),
@@ -603,6 +686,7 @@
                       MAP(v128_madd_us8),
                       MAP(v128_avg_u8),
                       MAP(v128_rdavg_u8),
+                      MAP(v128_rdavg_u16),
                       MAP(v128_avg_u16),
                       MAP(v128_min_u8),
                       MAP(v128_max_u8),
@@ -610,12 +694,17 @@
                       MAP(v128_max_s8),
                       MAP(v128_min_s16),
                       MAP(v128_max_s16),
+                      MAP(v128_min_s32),
+                      MAP(v128_max_s32),
                       MAP(v128_cmpgt_s8),
                       MAP(v128_cmplt_s8),
                       MAP(v128_cmpeq_8),
                       MAP(v128_cmpgt_s16),
                       MAP(v128_cmpeq_16),
                       MAP(v128_cmplt_s16),
+                      MAP(v128_cmpgt_s32),
+                      MAP(v128_cmpeq_32),
+                      MAP(v128_cmplt_s32),
                       MAP(v128_shuffle_8),
                       MAP(imm_v128_align<1>),
                       MAP(imm_v128_align<2>),
@@ -634,6 +723,7 @@
                       MAP(imm_v128_align<15>),
                       MAP(v128_abs_s8),
                       MAP(v128_abs_s16),
+                      MAP(v128_padd_u8),
                       MAP(v128_padd_s16),
                       MAP(v128_unpacklo_u16_s32),
                       MAP(v128_unpacklo_s16_s32),
@@ -738,6 +828,54 @@
                       MAP(imm_v128_shr_n_s32<20>),
                       MAP(imm_v128_shr_n_s32<24>),
                       MAP(imm_v128_shr_n_s32<28>),
+                      MAP(imm_v128_shl_n_64<1>),
+                      MAP(imm_v128_shl_n_64<4>),
+                      MAP(imm_v128_shl_n_64<8>),
+                      MAP(imm_v128_shl_n_64<12>),
+                      MAP(imm_v128_shl_n_64<16>),
+                      MAP(imm_v128_shl_n_64<20>),
+                      MAP(imm_v128_shl_n_64<24>),
+                      MAP(imm_v128_shl_n_64<28>),
+                      MAP(imm_v128_shl_n_64<32>),
+                      MAP(imm_v128_shl_n_64<36>),
+                      MAP(imm_v128_shl_n_64<40>),
+                      MAP(imm_v128_shl_n_64<44>),
+                      MAP(imm_v128_shl_n_64<48>),
+                      MAP(imm_v128_shl_n_64<52>),
+                      MAP(imm_v128_shl_n_64<56>),
+                      MAP(imm_v128_shl_n_64<60>),
+                      MAP(imm_v128_shr_n_u64<1>),
+                      MAP(imm_v128_shr_n_u64<4>),
+                      MAP(imm_v128_shr_n_u64<8>),
+                      MAP(imm_v128_shr_n_u64<12>),
+                      MAP(imm_v128_shr_n_u64<16>),
+                      MAP(imm_v128_shr_n_u64<20>),
+                      MAP(imm_v128_shr_n_u64<24>),
+                      MAP(imm_v128_shr_n_u64<28>),
+                      MAP(imm_v128_shr_n_u64<32>),
+                      MAP(imm_v128_shr_n_u64<36>),
+                      MAP(imm_v128_shr_n_u64<40>),
+                      MAP(imm_v128_shr_n_u64<44>),
+                      MAP(imm_v128_shr_n_u64<48>),
+                      MAP(imm_v128_shr_n_u64<52>),
+                      MAP(imm_v128_shr_n_u64<56>),
+                      MAP(imm_v128_shr_n_u64<60>),
+                      MAP(imm_v128_shr_n_s64<1>),
+                      MAP(imm_v128_shr_n_s64<4>),
+                      MAP(imm_v128_shr_n_s64<8>),
+                      MAP(imm_v128_shr_n_s64<12>),
+                      MAP(imm_v128_shr_n_s64<16>),
+                      MAP(imm_v128_shr_n_s64<20>),
+                      MAP(imm_v128_shr_n_s64<24>),
+                      MAP(imm_v128_shr_n_s64<28>),
+                      MAP(imm_v128_shr_n_s64<32>),
+                      MAP(imm_v128_shr_n_s64<36>),
+                      MAP(imm_v128_shr_n_s64<40>),
+                      MAP(imm_v128_shr_n_s64<44>),
+                      MAP(imm_v128_shr_n_s64<48>),
+                      MAP(imm_v128_shr_n_s64<52>),
+                      MAP(imm_v128_shr_n_s64<56>),
+                      MAP(imm_v128_shr_n_s64<60>),
                       MAP(v128_from_v64),
                       MAP(v128_zip_8),
                       MAP(v128_zip_16),
@@ -756,21 +894,29 @@
                       MAP(v128_shl_32),
                       MAP(v128_shr_u32),
                       MAP(v128_shr_s32),
+                      MAP(v128_shl_64),
+                      MAP(v128_shr_u64),
+                      MAP(v128_shr_s64),
                       MAP(v128_hadd_u8),
+                      MAP(v128_dotp_su8),
                       MAP(v128_dotp_s16),
+                      MAP(v128_dotp_s32),
                       MAP(v128_low_u32),
                       MAP(v128_low_v64),
                       MAP(v128_high_v64),
                       MAP(v128_from_64),
                       MAP(v128_from_32),
+                      MAP(v128_movemask_8),
                       MAP(v128_zero),
                       MAP(v128_dup_8),
                       MAP(v128_dup_16),
                       MAP(v128_dup_32),
+                      MAP(v128_dup_64),
                       MAP(v128_unpacklo_u8_s16),
                       MAP(v128_unpackhi_u8_s16),
                       MAP(v128_unpacklo_s8_s16),
                       MAP(v128_unpackhi_s8_s16),
+                      MAP(v128_blend_8),
                       MAP(u32_load_unaligned),
                       MAP(u32_store_unaligned),
                       MAP(v64_load_unaligned),
@@ -779,12 +925,20 @@
                       MAP(v128_store_unaligned),
                       MAP(v256_sad_u8),
                       MAP(v256_ssd_u8),
+                      MAP(v256_sad_u16),
+                      MAP(v256_ssd_s16),
                       MAP(v256_hadd_u8),
+                      MAP(v256_low_u64),
+                      MAP(v256_dotp_su8),
                       MAP(v256_dotp_s16),
+                      MAP(v256_dotp_s32),
                       MAP(v256_add_8),
                       MAP(v256_add_16),
+                      MAP(v256_sadd_s8),
+                      MAP(v256_sadd_u8),
                       MAP(v256_sadd_s16),
                       MAP(v256_add_32),
+                      MAP(v256_add_64),
                       MAP(v256_sub_8),
                       MAP(v256_ssub_u8),
                       MAP(v256_ssub_s8),
@@ -792,6 +946,7 @@
                       MAP(v256_ssub_u16),
                       MAP(v256_ssub_s16),
                       MAP(v256_sub_32),
+                      MAP(v256_sub_64),
                       MAP(v256_ziplo_8),
                       MAP(v256_ziphi_8),
                       MAP(v256_ziplo_16),
@@ -806,6 +961,9 @@
                       MAP(v256_unziplo_16),
                       MAP(v256_unziphi_32),
                       MAP(v256_unziplo_32),
+                      MAP(v256_unziphi_64),
+                      MAP(v256_unziplo_64),
+                      MAP(v256_pack_s32_u16),
                       MAP(v256_pack_s32_s16),
                       MAP(v256_pack_s16_u8),
                       MAP(v256_pack_s16_s8),
@@ -820,6 +978,7 @@
                       MAP(v256_madd_us8),
                       MAP(v256_avg_u8),
                       MAP(v256_rdavg_u8),
+                      MAP(v256_rdavg_u16),
                       MAP(v256_avg_u16),
                       MAP(v256_min_u8),
                       MAP(v256_max_u8),
@@ -827,14 +986,20 @@
                       MAP(v256_max_s8),
                       MAP(v256_min_s16),
                       MAP(v256_max_s16),
+                      MAP(v256_min_s32),
+                      MAP(v256_max_s32),
                       MAP(v256_cmpgt_s8),
                       MAP(v256_cmplt_s8),
                       MAP(v256_cmpeq_8),
                       MAP(v256_cmpgt_s16),
                       MAP(v256_cmplt_s16),
                       MAP(v256_cmpeq_16),
+                      MAP(v256_cmpgt_s32),
+                      MAP(v256_cmplt_s32),
+                      MAP(v256_cmpeq_32),
                       MAP(v256_shuffle_8),
                       MAP(v256_pshuffle_8),
+                      MAP(v256_wideshuffle_8),
                       MAP(imm_v256_align<1>),
                       MAP(imm_v256_align<2>),
                       MAP(imm_v256_align<3>),
@@ -884,8 +1049,12 @@
                       MAP(v256_shl_32),
                       MAP(v256_shr_u32),
                       MAP(v256_shr_s32),
+                      MAP(v256_shl_64),
+                      MAP(v256_shr_u64),
+                      MAP(v256_shr_s64),
                       MAP(v256_abs_s8),
                       MAP(v256_abs_s16),
+                      MAP(v256_padd_u8),
                       MAP(v256_padd_s16),
                       MAP(v256_unpacklo_u16_s32),
                       MAP(v256_unpacklo_s16_s32),
@@ -1022,10 +1191,60 @@
                       MAP(imm_v256_shr_n_s32<20>),
                       MAP(imm_v256_shr_n_s32<24>),
                       MAP(imm_v256_shr_n_s32<28>),
+                      MAP(imm_v256_shl_n_64<1>),
+                      MAP(imm_v256_shl_n_64<4>),
+                      MAP(imm_v256_shl_n_64<8>),
+                      MAP(imm_v256_shl_n_64<12>),
+                      MAP(imm_v256_shl_n_64<16>),
+                      MAP(imm_v256_shl_n_64<20>),
+                      MAP(imm_v256_shl_n_64<24>),
+                      MAP(imm_v256_shl_n_64<28>),
+                      MAP(imm_v256_shl_n_64<32>),
+                      MAP(imm_v256_shl_n_64<36>),
+                      MAP(imm_v256_shl_n_64<40>),
+                      MAP(imm_v256_shl_n_64<44>),
+                      MAP(imm_v256_shl_n_64<48>),
+                      MAP(imm_v256_shl_n_64<52>),
+                      MAP(imm_v256_shl_n_64<56>),
+                      MAP(imm_v256_shl_n_64<60>),
+                      MAP(imm_v256_shr_n_u64<1>),
+                      MAP(imm_v256_shr_n_u64<4>),
+                      MAP(imm_v256_shr_n_u64<8>),
+                      MAP(imm_v256_shr_n_u64<12>),
+                      MAP(imm_v256_shr_n_u64<16>),
+                      MAP(imm_v256_shr_n_u64<20>),
+                      MAP(imm_v256_shr_n_u64<24>),
+                      MAP(imm_v256_shr_n_u64<28>),
+                      MAP(imm_v256_shr_n_u64<32>),
+                      MAP(imm_v256_shr_n_u64<36>),
+                      MAP(imm_v256_shr_n_u64<40>),
+                      MAP(imm_v256_shr_n_u64<44>),
+                      MAP(imm_v256_shr_n_u64<48>),
+                      MAP(imm_v256_shr_n_u64<52>),
+                      MAP(imm_v256_shr_n_u64<56>),
+                      MAP(imm_v256_shr_n_u64<60>),
+                      MAP(imm_v256_shr_n_s64<1>),
+                      MAP(imm_v256_shr_n_s64<4>),
+                      MAP(imm_v256_shr_n_s64<8>),
+                      MAP(imm_v256_shr_n_s64<12>),
+                      MAP(imm_v256_shr_n_s64<16>),
+                      MAP(imm_v256_shr_n_s64<20>),
+                      MAP(imm_v256_shr_n_s64<24>),
+                      MAP(imm_v256_shr_n_s64<28>),
+                      MAP(imm_v256_shr_n_s64<32>),
+                      MAP(imm_v256_shr_n_s64<36>),
+                      MAP(imm_v256_shr_n_s64<40>),
+                      MAP(imm_v256_shr_n_s64<44>),
+                      MAP(imm_v256_shr_n_s64<48>),
+                      MAP(imm_v256_shr_n_s64<52>),
+                      MAP(imm_v256_shr_n_s64<56>),
+                      MAP(imm_v256_shr_n_s64<60>),
+                      MAP(v256_movemask_8),
                       MAP(v256_zero),
                       MAP(v256_dup_8),
                       MAP(v256_dup_16),
                       MAP(v256_dup_32),
+                      MAP(v256_dup_64),
                       MAP(v256_low_u32),
                       MAP(v256_low_v64),
                       MAP(v256_from_64),
@@ -1036,6 +1255,7 @@
                       MAP(v256_unpackhi_u8_s16),
                       MAP(v256_unpacklo_s8_s16),
                       MAP(v256_unpackhi_s8_s16),
+                      MAP(v256_blend_8),
                       { NULL, NULL, NULL } };
 #undef MAP
 
@@ -1052,7 +1272,7 @@
   *simd = m[i].simd;
 }
 
-// Used for printing errors in TestSimd1Arg and TestSimd2Args
+// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
 std::string Print(const uint8_t *a, int size) {
   std::string text = "0x";
   for (int i = 0; i < size; i++) {
@@ -1065,7 +1285,8 @@
   return text;
 }
 
-// Used in TestSimd1Arg and TestSimd2Args to restrict argument ranges
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
 void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
   switch (maskwidth) {
     case 0: {
@@ -1143,16 +1364,16 @@
   return *(reinterpret_cast<const uint8_t *>(p));
 }
 
-// CompareSimd1Arg and CompareSimd2Args compare intrinsics taking 1 or
-// 2 arguments respectively with their corresponding C reference.
-// Ideally, the loads and stores should have gone into the template
-// parameter list, but v64 and v128 could be typedef'ed to the same
-// type (which is the case on x86) and then we can't instantiate both
-// v64 and v128, so the function return and argument types, including
-// the always differing types in the C equivalent are used instead.
-// The function arguments must be void pointers and then go through a
-// cast to avoid matching errors in the branches eliminated by the
-// typeid tests in the calling function.
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference.  Ideally, the loads and stores should
+// have gone into the template parameter list, but v64 and v128 could
+// be typedef'ed to the same type (which is the case on x86) and then
+// we can't instantiate both v64 and v128, so the function return and
+// argument types, including the always differing types in the C
+// equivalent are used instead.  The function arguments must be void
+// pointers and then go through a cast to avoid matching errors in the
+// branches eliminated by the typeid tests in the calling function.
 template <typename Ret, typename Arg, typename CRet, typename CArg>
 int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
                     fptr c_load, fptr c_simd, void *ref_d, const void *a) {
@@ -1195,6 +1416,35 @@
   return memcmp(ref_d, d, sizeof(CRet));
 }
 
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CArg3 (*const my_c_load3)(const void *) =
+      (CArg3(*const)(const void *))c_load3;
+  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
 }  // namespace
 
 template <typename CRet, typename CArg>
@@ -1357,6 +1607,14 @@
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v128_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V128_U64
+      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(c_v256) &&
                typeid(CArg) == typeid(c_v256)) {
       // V256_V256
@@ -1397,6 +1655,14 @@
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v256_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V256_U64
+      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(uint32_t) &&
                typeid(CArg) == typeid(c_v256)) {
       // U32_V256
@@ -1535,6 +1801,18 @@
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U64_V128V128
+      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v128) &&
                typeid(CArg2) == typeid(c_v128)) {
@@ -1595,6 +1873,18 @@
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U64_V256V256
+      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v256) &&
                typeid(CArg2) == typeid(c_v256)) {
@@ -1657,6 +1947,81 @@
                       << Print(ref_d, sizeof(ref_d)) << " (ref)";
 }
 
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[sizeof(CArg1)]);
+  DECLARE_ALIGNED(32, uint8_t, s2[sizeof(CArg2)]);
+  DECLARE_ALIGNED(32, uint8_t, s3[sizeof(CArg3)]);
+  DECLARE_ALIGNED(32, uint8_t, d[sizeof(CRet)]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[sizeof(CRet)]);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+      // V128_V128V128V128
+      error =
+          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v128_store_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v128_store_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256) &&
+               typeid(CArg3) == typeid(c_v256)) {
+      // V256_V256V256V256
+      error =
+          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v256_store_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v256_store_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+             << typeid(CArg3).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
+                      << ", " << Print(s3, sizeof(s3)) << ") -> "
+                      << Print(d, sizeof(d)) << " (simd), "
+                      << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
+
 // Instantiations to make the functions callable from another files
 template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
@@ -1692,6 +2057,8 @@
                                              const char *);
 template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
 template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
@@ -1708,10 +2075,15 @@
                                                         uint32_t, const char *);
 template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                            const char *);
 template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1724,6 +2096,8 @@
                                              const char *);
 template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                              const char *);
 template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1734,9 +2108,14 @@
                                                     uint32_t, const char *);
 template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 
 }  // namespace SIMD_NAMESPACE