Add v64_abs_s8, v128_abs_s8 and v256_abs_s8
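
Add an 8-bit signed absolute value operation to the v64, v128 and
v256 layers of the SIMD abstraction: C reference implementations,
Arm NEON versions (vabs_s8/vabsq_s8) and x86 versions that use
_mm_abs_epi8/_mm256_abs_epi8 when SSSE3/AVX2 is available and
otherwise fall back to an SSE2 compare-and-select between a and -a.
As with the existing abs_s16 ops, the most negative lane value
(-128) maps to itself.

Illustrative use through the abstraction layer (a minimal sketch,
not part of this change; it only assumes the existing unaligned
load/store helpers):

  #include "aom_dsp/simd/v128_intrinsics.h"

  /* Per-lane |x| of 16 signed bytes; INT8_MIN stays INT8_MIN. */
  static void abs_s8_block(int8_t dst[16], const int8_t src[16]) {
    v128_store_unaligned(dst, v128_abs_s8(v128_load_unaligned(src)));
  }
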
Change-Id: I529509e4e997ba123799a3a581d20624d75cf582
diff --git a/aom_dsp/simd/v128_intrinsics.h b/aom_dsp/simd/v128_intrinsics.h
index 5c37d2d..b0f0a78 100644
--- a/aom_dsp/simd/v128_intrinsics.h
+++ b/aom_dsp/simd/v128_intrinsics.h
@@ -97,6 +97,7 @@
SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
+SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 5ef46a8..7422b27 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -208,6 +208,10 @@
return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}
+SIMD_INLINE v128 v128_abs_s8(v128 x) {
+ return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
+}
+
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
return vreinterpretq_s64_s32(
vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index 9ab6dd3..569275a 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -244,6 +244,10 @@
return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}
+SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
+}
+
SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
c_v64 lo_bits = c_v64_mullo_s16(a, b);
c_v64 hi_bits = c_v64_mulhi_s16(a, b);
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index bd17dd1..ce078cb 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -120,6 +120,16 @@
#endif
}
+SIMD_INLINE v128 v128_abs_s8(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v128 t = _mm_sub_epi8(_mm_setzero_si128(), a);  // t = -a
+ v128 mask = _mm_cmplt_epi8(t, a);  // all-ones in lanes where a > 0
+ return _mm_or_si128(_mm_andnot_si128(mask, t), _mm_and_si128(mask, a));
+#endif
+}
+
SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
return _mm_unpacklo_epi8(b, a);
}
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
index 69be067..94b940b 100644
--- a/aom_dsp/simd/v256_intrinsics.h
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -99,6 +99,7 @@
SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index c4cb6a0..12e393a 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -253,6 +253,10 @@
return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}
+SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
+}
+
SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
c_v128 lo_bits = c_v128_mullo_s16(a, b);
c_v128 hi_bits = c_v128_mulhi_s16(a, b);
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index 6721417..b009bad 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -211,6 +211,10 @@
return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
}
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+ return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
+}
+
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
v128 lo_bits = v128_mullo_s16(a, b);
v128 hi_bits = v128_mulhi_s16(a, b);
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 06bce52..bd74cee 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -118,6 +118,8 @@
SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
+
// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit
// lanes of lower or upper halves of a 256bit vector because the
// unpack/pack intrinsics operate on the 256 bit input vector as 2
diff --git a/aom_dsp/simd/v64_intrinsics.h b/aom_dsp/simd/v64_intrinsics.h
index fc59d7d..e51939a 100644
--- a/aom_dsp/simd/v64_intrinsics.h
+++ b/aom_dsp/simd/v64_intrinsics.h
@@ -81,6 +81,7 @@
SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 7342059..cc7f6ff 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -242,6 +242,10 @@
return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}
+SIMD_INLINE v64 v64_abs_s8(v64 x) {
+ return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
+}
+
SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h
index 7475c51..d64f44b 100644
--- a/aom_dsp/simd/v64_intrinsics_c.h
+++ b/aom_dsp/simd/v64_intrinsics_c.h
@@ -264,6 +264,13 @@
return t;
}
+SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
+ return t;
+}
+
SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 84394a4..b0296fd 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -138,6 +138,16 @@
#endif
}
+SIMD_INLINE v64 v64_abs_s8(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v64 t = _mm_sub_epi8(_mm_setzero_si128(), a);  // t = -a
+ v64 mask = _mm_cmplt_epi8(t, a);  // all-ones in lanes where a > 0
+ return _mm_or_si128(_mm_andnot_si128(mask, t), _mm_and_si128(mask, a));
+#endif
+}
+
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {