Misc v64/v128/v256 intrinsics cleanup and robustification
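
Add explicit (void) argument lists to the parameterless intrinsics,
turn the C-only SAD accumulators into structs that track a value and a
call count (SIMD_CHECK builds now abort once the documented 32-call
limit is exceeded), handle zero shift counts in the C and NEON shift
helpers, move the v256 ssd_s16 wrappers next to the other C-backed
wrappers, use single AVX2 instructions for several v256 x86 helpers,
and gate _mm256_sra_epi64 on __AVX512VL__ rather than __AVX512F__.

Rough sketch of how the new SAD overuse check behaves, against the C
fallback only (src/ref and their contents are hypothetical and not
part of this change):

    uint8_t src[8] = { 0 }, ref[8] = { 0 };    /* hypothetical pixels */
    c_v64 a = c_v64_load_unaligned(src);
    c_v64 b = c_v64_load_unaligned(ref);
    c_sad64_internal s = c_v64_sad_u8_init();  /* val = count = 0 */
    s = c_v64_sad_u8(s, a, b);      /* adds |src[i] - ref[i]|, count++ */
    uint32_t sad = c_v64_sad_u8_sum(s);        /* returns s.val */
    /* Calling c_v64_sad_u8() more than 32 times on one accumulator
     * aborts under SIMD_CHECK; the result would be undefined anyway. */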

Change-Id: I3b7adfb313f9bcf709831ca74a677b86279cf421
diff --git a/aom_dsp/simd/v128_intrinsics.h b/aom_dsp/simd/v128_intrinsics.h
index 01dbb8f..20808d4 100644
--- a/aom_dsp/simd/v128_intrinsics.h
+++ b/aom_dsp/simd/v128_intrinsics.h
@@ -54,28 +54,30 @@
   return c_v128_align(a, b, c);
 }
 
-SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
 SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
 
-typedef uint32_t sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+  return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) {
   return c_v128_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
   return c_v128_sad_u8_sum(s);
 }
-typedef uint32_t ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+  return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) {
   return c_v128_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
   return c_v128_ssd_u8_sum(s);
 }
 SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
   return c_v128_dotp_su8(a, b);
 }
@@ -318,7 +320,7 @@
 }
 
 typedef uint32_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
   return c_v128_sad_u16_init();
 }
 SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
@@ -330,7 +332,7 @@
 }
 
 typedef uint64_t ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) {
   return c_v128_ssd_s16_init();
 }
 SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 3c669d5..2d497f4 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -68,9 +68,11 @@
 #endif
 }
 
-SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
+SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
 
-SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
+SIMD_INLINE v128 v128_ones(void) {
+  return vreinterpretq_s64_u8(vdupq_n_u8(-1));
+}
 
 SIMD_INLINE v128 v128_dup_8(uint8_t x) {
   return vreinterpretq_s64_u8(vdupq_n_u8(x));
@@ -136,7 +138,7 @@
   sad64_internal hi, lo;
 } sad128_internal;
 
-SIMD_INLINE sad128_internal v128_sad_u8_init() {
+SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
   sad128_internal s;
   s.hi = s.lo = vdupq_n_u16(0);
   return s;
@@ -165,7 +167,7 @@
   ssd64_internal hi, lo;
 } ssd128_internal;
 
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
+SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
   ssd128_internal s;
   s.hi = s.lo = v64_ssd_u8_init();
   return s;
@@ -784,68 +786,79 @@
 }
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  return n < 8
-             ? v128_from_64(
-                   (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                        n * 8),
-                   (uint64_t)vorr_u64(
-                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
-                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                  (8 - n) * 8)))
-             : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
-                                             vget_high_s64(a)))
-                       : v128_from_64(
-                             0, (uint64_t)vshr_n_u64(
-                                    vreinterpret_u64_s64(vget_high_s64(a)),
-                                    (n - 8) * 8)));
+  return n == 0
+             ? a
+             : (n < 8
+                    ? v128_from_64(
+                          (uint64_t)vshr_n_u64(
+                              vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
+                          (uint64_t)vorr_u64(
+                              vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                         n * 8),
+                              vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                         (8 - n) * 8)))
+                    : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+                                                    vget_high_s64(a)))
+                              : v128_from_64(0, (uint64_t)vshr_n_u64(
+                                                    vreinterpret_u64_s64(
+                                                        vget_high_s64(a)),
+                                                    (n - 8) * 8))));
 }
 
 SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
+  return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
+  return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
+  return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
+  return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
+  return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
+  return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
+  return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
+  return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
+  return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+  return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+  return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
+           : a;
 }
 
 SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return vshrq_n_s64(a, c);
+  return c ? vshrq_n_s64(a, c) : a;
 }
 
 #else
@@ -920,7 +933,9 @@
 
 typedef uint32x4_t sad128_internal_u16;
 
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
+  return vdupq_n_u32(0);
+}
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_sad_u16_sum(). */
@@ -939,7 +954,7 @@
 }
 
 typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index bbe9a9d..466a41e 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -93,7 +93,7 @@
   c_v128_store_unaligned(p, a);
 }
 
-SIMD_INLINE c_v128 c_v128_zero() {
+SIMD_INLINE c_v128 c_v128_zero(void) {
   c_v128 t;
   t.u64[1] = t.u64[0] = 0;
   return t;
@@ -145,26 +145,39 @@
   return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
 }
 
-typedef uint32_t c_sad128_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad128_internal;
 
-SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
+  c_sad128_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
-   v128_sad_u8_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
+ * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
+ * undefined. */
 SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                             c_v128 b) {
   int c;
   for (c = 0; c < 16; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
 
 typedef uint32_t c_ssd128_internal;
 
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_u8_sum(). */
@@ -720,6 +733,7 @@
 }
 
 SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
   if (n < 8)
     return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                     c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -729,6 +743,7 @@
 }
 
 SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
   if (n < 8)
     return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                            c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -854,7 +869,7 @@
 
 typedef uint32_t c_sad128_internal_u16;
 
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_sad_u16_sum(). */
@@ -870,7 +885,7 @@
 
 typedef uint64_t c_ssd128_internal_s16;
 
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index fc3e5a2..c404015 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -45,7 +45,7 @@
 
 SIMD_INLINE v128 v128_load_unaligned(const void *p) {
 #if defined(__SSSE3__)
-  return (__m128i)_mm_lddqu_si128((__m128i *)p);
+  return _mm_lddqu_si128((__m128i *)p);
 #else
   return _mm_loadu_si128((__m128i *)p);
 #endif
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
index cb99d35..b66b4bb 100644
--- a/aom_dsp/simd/v256_intrinsics.h
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -57,29 +57,42 @@
   return c_v256_align(a, b, c);
 }
 
-SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
 SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
 
-typedef uint32_t sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+  return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
   return c_v256_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
   return c_v256_sad_u8_sum(s);
 }
-typedef uint32_t ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+  return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
   return c_v256_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) {
   return c_v256_ssd_u8_sum(s);
 }
 
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) {
+  return c_v256_ssd_s16_init();
+}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a,
+                                               v256 b) {
+  return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) {
+  return c_v256_ssd_s16_sum(s);
+}
+
 SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
   return c_v256_dotp_su8(a, b);
 }
@@ -350,7 +363,7 @@
 }
 
 typedef uint32_t sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
   return c_v256_sad_u16_init();
 }
 SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
@@ -361,16 +374,4 @@
   return c_v256_sad_u16_sum(s);
 }
-
-typedef uint64_t ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
-  return c_v256_ssd_s16_init();
-}
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  return c_v256_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  return c_v256_ssd_s16_sum(s);
-}
 }
 
 #endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index a1c08e9..8127ee3 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -149,9 +149,16 @@
   return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
 }
 
-typedef uint32_t c_sad256_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad256_internal;
 
-SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+  c_sad256_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
    v256_sad_u8_sum().
@@ -160,11 +167,17 @@
                                             c_v256 b) {
   int c;
   for (c = 0; c < 32; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
 
 typedef uint32_t c_ssd256_internal;
 
@@ -746,6 +759,7 @@
 }
 
 SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+  if (n == 0) return a;
   if (n < 16)
     return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
                                       c_v128_shr_n_byte(a.v128[0], 16 - n)),
@@ -758,6 +772,7 @@
 }
 
 SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+  if (n == 0) return a;
   if (n < 16)
     return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
                             c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index d5b7905..0d22667 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -73,7 +73,7 @@
   v128_store_aligned((uint8_t *)p + 16, a.val[1]);
 }
 
-SIMD_INLINE v256 v256_zero() {
+SIMD_INLINE v256 v256_zero(void) {
   return v256_from_v128(v128_zero(), v128_zero());
 }
 
@@ -117,7 +117,7 @@
   sad128_internal val[2];
 } sad256_internal;
 
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
   sad256_internal t;
   t.val[1] = v128_sad_u8_init();
   t.val[0] = v128_sad_u8_init();
@@ -142,7 +142,7 @@
   ssd128_internal val[2];
 } ssd256_internal;
 
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
   ssd256_internal t;
   t.val[1] = v128_ssd_u8_init();
   t.val[0] = v128_ssd_u8_init();
@@ -780,13 +780,16 @@
                   (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                   v128_zero()))
 
-#define v256_shr_n_byte(a, n)                                              \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
-                             v128_or(v128_shr_n_byte(a.val[0], n),         \
-                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
-            : v256_from_v128(                                              \
-                  v128_zero(),                                             \
-                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
+#define v256_shr_n_byte(a, n)                                                \
+  ((n) == 0                                                                  \
+       ? a                                                                   \
+       : ((n) < 16                                                           \
+              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
+                               v128_or(v128_shr_n_byte(a.val[0], n),         \
+                                       v128_shl_n_byte(a.val[1], 16 - (n)))) \
+              : v256_from_v128(                                              \
+                    v128_zero(),                                             \
+                    (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))
 
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
@@ -823,7 +826,7 @@
   sad128_internal_u16 val[2];
 } sad256_internal_u16;
 
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
   sad256_internal_u16 t;
   t.val[1] = v128_sad_u16_init();
   t.val[0] = v128_sad_u16_init();
@@ -849,7 +852,7 @@
   ssd128_internal_s16 val[2];
 } ssd256_internal_s16;
 
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
   ssd256_internal_s16 t;
   t.val[1] = v128_ssd_s16_init();
   t.val[0] = v128_ssd_s16_init();
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 8207cba..5983cb8 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -57,7 +57,7 @@
 }
 
 SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+  return _mm256_set_epi64x(a, b, c, d);
 }
 
 SIMD_INLINE v256 v256_load_aligned(const void *p) {
@@ -76,7 +76,7 @@
   _mm256_storeu_si256((__m256i *)p, a);
 }
 
-SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
 
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
 
@@ -187,11 +187,11 @@
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+  return _mm256_permute2x128_si256(a, b, 0x02);
 }
 
 SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+  return _mm256_permute2x128_si256(a, b, 0x13);
 }
 
 SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -256,9 +256,7 @@
       _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
   return _mm256_unpacklo_epi8(
@@ -311,11 +309,11 @@
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+  return _mm256_cvtepu16_epi32(a);
 }
 
 SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+  return _mm256_cvtepi16_epi32(a);
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
@@ -442,7 +440,7 @@
 
 typedef v256 sad256_internal;
 
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
   return _mm256_setzero_si256();
 }
 
@@ -460,7 +458,7 @@
 
 typedef v256 ssd256_internal;
 
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
   return _mm256_setzero_si256();
 }
 
@@ -646,7 +644,7 @@
 }
 
 SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-#if defined(__AVX512F__)
+#if defined(__AVX512VL__)
   return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
 #else
   return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
@@ -670,13 +668,15 @@
   ((n) < 16                                                                  \
        ? _mm256_alignr_epi8(                                                 \
              _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
-       : _mm256_inserti128_si256(                                            \
-             _mm256_setzero_si256(),                                         \
-             v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
+       : ((n) == 16                                                          \
+              ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3)      \
+              : _mm256_inserti128_si256(                                     \
+                    _mm256_setzero_si256(),                                  \
+                    v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)))
 
 // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
 #define v256_align(a, b, c) \
-  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
+  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
 #define v256_shl_n_8(a, c)                                   \
   _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
@@ -701,7 +701,7 @@
 
 typedef v256 sad256_internal_u16;
 
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v256_sad_u16_sum(). */
@@ -728,7 +728,7 @@
 
 typedef v256 ssd256_internal_s16;
 
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v256_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v64_intrinsics.h b/aom_dsp/simd/v64_intrinsics.h
index afc5542..7079949 100644
--- a/aom_dsp/simd/v64_intrinsics.h
+++ b/aom_dsp/simd/v64_intrinsics.h
@@ -65,7 +65,7 @@
   return c_v64_align(a, b, c);
 }
 
-SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
 SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
 SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
 SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
@@ -128,20 +128,22 @@
   return c_v64_shuffle_8(a, pattern);
 }
 
-typedef uint32_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) {
+  return c_v64_sad_u8_init();
+}
+SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) {
   return c_v64_sad_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) {
   return c_v64_sad_u8_sum(s);
 }
-typedef uint32_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) {
+  return c_v64_ssd_u8_init();
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) {
   return c_v64_ssd_u8(s, a, b);
 }
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) {
   return c_v64_ssd_u8_sum(s);
 }
 SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 0f661a1..a4ecdf4 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -111,7 +111,7 @@
 #endif
 }
 
-SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
+SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); }
 
 SIMD_INLINE v64 v64_dup_8(uint8_t x) {
   return vreinterpret_s64_u8(vdup_n_u8(x));
@@ -162,7 +162,7 @@
 
 typedef uint16x8_t sad64_internal;
 
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); }
 
 // Implementation dependent return value. Result must be finalised with
 // v64_sad_u8_sum().
@@ -181,7 +181,7 @@
 
 typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); }
 
 // Implementation dependent return value. Result must be finalised with
 // v64_ssd_u8_sum().
@@ -608,39 +608,39 @@
 }
 
 SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
+  return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
+  return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
-  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
+  return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
+  return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
+  return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
+  return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
+  return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
+  return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a;
 }
 
 SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
+  return c ? vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a;
 }
 
 #else
diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h
index 090c448..b84f243 100644
--- a/aom_dsp/simd/v64_intrinsics_c.h
+++ b/aom_dsp/simd/v64_intrinsics_c.h
@@ -143,7 +143,7 @@
   c_v64_store_unaligned(p, a);
 }
 
-SIMD_INLINE c_v64 c_v64_zero() {
+SIMD_INLINE c_v64 c_v64_zero(void) {
   c_v64 t;
   t.u64 = 0;
   return t;
@@ -601,28 +601,41 @@
   return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
 }
 
-typedef uint32_t c_sad64_internal;
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad64_internal;
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
+  c_sad64_internal t;
+  t.val = t.count = 0;
+  return t;
+}
 
 /* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
-
+   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
+   undefined. */
 SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                           c_v64 b) {
   int c;
   for (c = 0; c < 8; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called 32 times returning an undefined result\n");
+    abort();
+  }
   return s;
 }
 
-SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
 
 typedef uint32_t c_ssd64_internal;
 
 /* Implementation dependent return value.  Result must be finalised with
  * v64_ssd_u8_sum(). */
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }
 
 SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                           c_v64 b) {
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 8c7b4c4..1f273fe 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -99,7 +99,7 @@
        : (b))
 #endif
 
-SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
+SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
 
 SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
 
@@ -319,7 +319,7 @@
 
 typedef v64 sad64_internal;
 
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }
 
 /* Implementation dependent return value.  Result must be finalised with
    v64_sad_u8_sum().
@@ -332,7 +332,7 @@
 
 typedef v64 ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v64_ssd_u8_sum(). */
@@ -433,7 +433,7 @@
 SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
                        _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }