Misc v64/v128/v256 intrinsics cleanup and robustification
Change-Id: I3b7adfb313f9bcf709831ca74a677b86279cf421
diff --git a/aom_dsp/simd/v128_intrinsics.h b/aom_dsp/simd/v128_intrinsics.h
index 01dbb8f..20808d4 100644
--- a/aom_dsp/simd/v128_intrinsics.h
+++ b/aom_dsp/simd/v128_intrinsics.h
@@ -54,28 +54,31 @@
return c_v128_align(a, b, c);
}
-SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
-typedef uint32_t sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+ return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) {
return c_v128_sad_u8(s, a, b);
}
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
return c_v128_sad_u8_sum(s);
}
-typedef uint32_t ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+ return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) {
return c_v128_ssd_u8(s, a, b);
}
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
return c_v128_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
return c_v128_dotp_su8(a, b);
}
@@ -318,7 +321,7 @@
}
typedef uint32_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
return c_v128_sad_u16_init();
}
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
@@ -330,7 +333,7 @@
}
typedef uint64_t ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) {
return c_v128_ssd_s16_init();
}
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 3c669d5..2d497f4 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -68,9 +68,11 @@
#endif
}
-SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
+SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
-SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
+SIMD_INLINE v128 v128_ones(void) {
+ return vreinterpretq_s64_u8(vdupq_n_u8(-1));
+}
SIMD_INLINE v128 v128_dup_8(uint8_t x) {
return vreinterpretq_s64_u8(vdupq_n_u8(x));
@@ -136,7 +138,7 @@
sad64_internal hi, lo;
} sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() {
+SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
sad128_internal s;
s.hi = s.lo = vdupq_n_u16(0);
return s;
@@ -165,7 +167,7 @@
ssd64_internal hi, lo;
} ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
+SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) {
ssd128_internal s;
s.hi = s.lo = v64_ssd_u8_init();
return s;
@@ -784,68 +786,79 @@
}
SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
- return n < 8
- ? v128_from_64(
- (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- n * 8),
- (uint64_t)vorr_u64(
- vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
- vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- (8 - n) * 8)))
- : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
- vget_high_s64(a)))
- : v128_from_64(
- 0, (uint64_t)vshr_n_u64(
- vreinterpret_u64_s64(vget_high_s64(a)),
- (n - 8) * 8)));
+ return n == 0
+ ? a
+ : (n < 8
+ ? v128_from_64(
+ (uint64_t)vshr_n_u64(
+ vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
+ (uint64_t)vorr_u64(
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ n * 8),
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ (8 - n) * 8)))
+ : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+ vget_high_s64(a)))
+ : v128_from_64(0, (uint64_t)vshr_n_u64(
+ vreinterpret_u64_s64(
+ vget_high_s64(a)),
+ (n - 8) * 8))));
}
SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
- return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
+ return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}
SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
- return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
+ return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a;
}
SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
- return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
+ return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a;
}
SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
- return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
+ return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
- return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
+ return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
- return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
+ return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
- return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
+ return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
- return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
+ return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
- return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
+ return c ? vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
- return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+ return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
- return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+ return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c))
+ : a;
}
SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
- return vshrq_n_s64(a, c);
+ return c ? vshrq_n_s64(a, c) : a;
}
#else
@@ -920,7 +933,9 @@
typedef uint32x4_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
+ return vdupq_n_u32(0);
+}
/* Implementation dependent return value. Result must be finalised with
* v128_sad_u16_sum(). */
@@ -939,7 +954,7 @@
}
typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
/* Implementation dependent return value. Result must be finalised with
* v128_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index bbe9a9d..466a41e 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -93,7 +93,7 @@
c_v128_store_unaligned(p, a);
}
-SIMD_INLINE c_v128 c_v128_zero() {
+SIMD_INLINE c_v128 c_v128_zero(void) {
c_v128 t;
t.u64[1] = t.u64[0] = 0;
return t;
@@ -145,26 +145,39 @@
return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}
-typedef uint32_t c_sad128_internal;
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad128_internal;
-SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
+ c_sad128_internal t;
+ t.val = t.count = 0;
+ return t;
+}
/* Implementation dependent return value. Result must be finalised with
- v128_sad_u8_sum().
- The result for more than 32 v128_sad_u8() calls is undefined. */
+ * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
+ * undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
c_v128 b) {
int c;
for (c = 0; c < 16; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+ abort();
+ }
return s;
}
-SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
typedef uint32_t c_ssd128_internal;
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }
/* Implementation dependent return value. Result must be finalised with
* v128_ssd_u8_sum(). */
@@ -720,6 +733,7 @@
}
SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+ if (n == 0) return a;
if (n < 8)
return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -729,6 +743,7 @@
}
SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+ if (n == 0) return a;
if (n < 8)
return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -854,7 +869,7 @@
typedef uint32_t c_sad128_internal_u16;
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }
/* Implementation dependent return value. Result must be finalised with
* v128_sad_u16_sum(). */
@@ -870,7 +885,7 @@
typedef uint64_t c_ssd128_internal_s16;
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
/* Implementation dependent return value. Result must be finalised with
* v128_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index fc3e5a2..c404015 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -45,7 +45,7 @@
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
#if defined(__SSSE3__)
- return (__m128i)_mm_lddqu_si128((__m128i *)p);
+ return _mm_lddqu_si128((__m128i *)p);
#else
return _mm_loadu_si128((__m128i *)p);
#endif
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
index cb99d35..b66b4bb 100644
--- a/aom_dsp/simd/v256_intrinsics.h
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -57,29 +57,42 @@
return c_v256_align(a, b, c);
}
-SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
-typedef uint32_t sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+ return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
return c_v256_sad_u8(s, a, b);
}
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
return c_v256_sad_u8_sum(s);
}
-typedef uint32_t ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+ return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
return c_v256_ssd_u8(s, a, b);
}
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) {
return c_v256_ssd_u8_sum(s);
}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) {
+ return c_v256_ssd_s16_init();
+}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) {
+ return c_v256_ssd_s16_sum(s);
+}
+
SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
return c_v256_dotp_su8(a, b);
}
@@ -350,7 +363,7 @@
}
typedef uint32_t sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
return c_v256_sad_u16_init();
}
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
@@ -361,16 +374,16 @@
return c_v256_sad_u16_sum(s);
}
-typedef uint64_t ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
-  return c_v256_ssd_s16_init();
-}
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  return c_v256_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  return c_v256_ssd_s16_sum(s);
-}
#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index a1c08e9..8127ee3 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -149,9 +149,16 @@
return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}
-typedef uint32_t c_sad256_internal;
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad256_internal;
-SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+ c_sad256_internal t;
+ t.val = t.count = 0;
+ return t;
+}
/* Implementation dependent return value. Result must be finalised with
v256_sad_u8_sum().
@@ -160,11 +167,17 @@
c_v256 b) {
int c;
for (c = 0; c < 32; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+ abort();
+ }
return s;
}
-SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
typedef uint32_t c_ssd256_internal;
@@ -746,6 +759,7 @@
}
SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+ if (n == 0) return a;
if (n < 16)
return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
c_v128_shr_n_byte(a.v128[0], 16 - n)),
@@ -758,6 +772,7 @@
}
SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+ if (n == 0) return a;
if (n < 16)
return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index d5b7905..0d22667 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -73,7 +73,7 @@
v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}
-SIMD_INLINE v256 v256_zero() {
+SIMD_INLINE v256 v256_zero(void) {
return v256_from_v128(v128_zero(), v128_zero());
}
@@ -117,7 +117,7 @@
sad128_internal val[2];
} sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
sad256_internal t;
t.val[1] = v128_sad_u8_init();
t.val[0] = v128_sad_u8_init();
@@ -142,7 +142,7 @@
ssd128_internal val[2];
} ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
ssd256_internal t;
t.val[1] = v128_ssd_u8_init();
t.val[0] = v128_ssd_u8_init();
@@ -780,13 +780,16 @@
(n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
v128_zero()))
-#define v256_shr_n_byte(a, n) \
- ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \
- v128_or(v128_shr_n_byte(a.val[0], n), \
- v128_shl_n_byte(a.val[1], 16 - (n)))) \
- : v256_from_v128( \
- v128_zero(), \
- (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
+#define v256_shr_n_byte(a, n) \
+  ((n) == 0                                                                   \
+ ? a \
+ : ((n) < 16 \
+ ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \
+ v128_or(v128_shr_n_byte(a.val[0], n), \
+ v128_shl_n_byte(a.val[1], 16 - (n)))) \
+ : v256_from_v128( \
+ v128_zero(), \
+ (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))
#define v256_align(a, b, c) \
((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
@@ -823,7 +826,7 @@
sad128_internal_u16 val[2];
} sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
sad256_internal_u16 t;
t.val[1] = v128_sad_u16_init();
t.val[0] = v128_sad_u16_init();
@@ -849,7 +852,7 @@
ssd128_internal_s16 val[2];
} ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
ssd256_internal_s16 t;
t.val[1] = v128_ssd_s16_init();
t.val[0] = v128_ssd_s16_init();
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 8207cba..5983cb8 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -57,7 +57,7 @@
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
- return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+  return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
@@ -76,7 +76,7 @@
_mm256_storeu_si256((__m256i *)p, a);
}
-SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
@@ -187,11 +187,11 @@
}
SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
- return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+ return _mm256_permute2x128_si256(a, b, 0x02);
}
SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
- return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+ return _mm256_permute2x128_si256(a, b, 0x13);
}
SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -256,9 +256,7 @@
_MM_SHUFFLE(3, 1, 2, 0));
}
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
- return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }
SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
return _mm256_unpacklo_epi8(
@@ -311,11 +309,11 @@
}
SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+ return _mm256_cvtepu16_epi32(a);
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+ return _mm256_cvtepi16_epi32(a);
}
SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
@@ -442,7 +440,7 @@
typedef v256 sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
return _mm256_setzero_si256();
}
@@ -460,7 +458,7 @@
typedef v256 ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
return _mm256_setzero_si256();
}
@@ -646,7 +644,7 @@
}
SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-#if defined(__AVX512F__)
+#if defined(__AVX512VL__)
return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
#else
return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
@@ -670,13 +668,15 @@
((n) < 16 \
? _mm256_alignr_epi8( \
_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
- : _mm256_inserti128_si256( \
- _mm256_setzero_si256(), \
- v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
+ : ((n) == 16 \
+ ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)))
// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
- ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
#define v256_shl_n_8(a, c) \
_mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
@@ -701,7 +701,7 @@
typedef v256 sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }
/* Implementation dependent return value. Result must be finalised with
* v256_sad_u16_sum(). */
@@ -728,7 +728,7 @@
typedef v256 ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }
/* Implementation dependent return value. Result must be finalised with
* v256_ssd_s16_sum(). */
diff --git a/aom_dsp/simd/v64_intrinsics.h b/aom_dsp/simd/v64_intrinsics.h
index afc5542..7079949 100644
--- a/aom_dsp/simd/v64_intrinsics.h
+++ b/aom_dsp/simd/v64_intrinsics.h
@@ -65,7 +65,7 @@
return c_v64_align(a, b, c);
}
-SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
@@ -128,20 +128,22 @@
return c_v64_shuffle_8(a, pattern);
}
-typedef uint32_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) {
+ return c_v64_sad_u8_init();
+}
+SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) {
return c_v64_sad_u8(s, a, b);
}
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) {
return c_v64_sad_u8_sum(s);
}
-typedef uint32_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) {
+ return c_v64_ssd_u8_init();
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) {
return c_v64_ssd_u8(s, a, b);
}
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) {
return c_v64_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 0f661a1..a4ecdf4 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -111,7 +111,7 @@
#endif
}
-SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
+SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) {
return vreinterpret_s64_u8(vdup_n_u8(x));
@@ -162,7 +162,7 @@
typedef uint16x8_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); }
// Implementation dependent return value. Result must be finalised with
// v64_sad_u8_sum().
@@ -181,7 +181,7 @@
typedef uint32x4_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); }
// Implementation dependent return value. Result must be finalised with
// v64_ssd_u8_sum().
@@ -608,39 +608,39 @@
}
SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
+ return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
+ return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
- return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
+ return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
+ return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
+ return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
- return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
+ return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
+ return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
+ return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a;
}
SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
- return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
+ return c ? vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a;
}
#else
diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h
index 090c448..b84f243 100644
--- a/aom_dsp/simd/v64_intrinsics_c.h
+++ b/aom_dsp/simd/v64_intrinsics_c.h
@@ -143,7 +143,7 @@
c_v64_store_unaligned(p, a);
}
-SIMD_INLINE c_v64 c_v64_zero() {
+SIMD_INLINE c_v64 c_v64_zero(void) {
c_v64 t;
t.u64 = 0;
return t;
@@ -601,28 +601,41 @@
return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}
-typedef uint32_t c_sad64_internal;
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad64_internal;
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
+ c_sad64_internal t;
+ t.val = t.count = 0;
+ return t;
+}
/* Implementation dependent return value. Result must be finalised with
- v64_sad_u8_sum().
- The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
-
+ v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
+ undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
c_v64 b) {
int c;
for (c = 0; c < 8; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+ abort();
+ }
return s;
}
-SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
typedef uint32_t c_ssd64_internal;
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
c_v64 b) {
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 8c7b4c4..1f273fe 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -99,7 +99,7 @@
: (b))
#endif
-SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
+SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
@@ -319,7 +319,7 @@
typedef v64 sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
v64_sad_u8_sum().
@@ -332,7 +332,7 @@
typedef v64 ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
@@ -433,7 +433,7 @@
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
_mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}