Move the SSSE3 code in variance_sse2.c to a new file
Name the new file variance_ssse3.c. variance_sse2.c is now SSE2 only.
Bug: aomedia:3578
Change-Id: I9d75b6617dac011de7dcd0366fb9f8e5999ec133
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 6d8e5a9..750df42 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -267,6 +267,7 @@
"${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c")
list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index e71244f..610695a 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -403,204 +403,6 @@
return *sse;
}
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt) \
- int aom_sub_pixel_variance##w##xh_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt) \
- DECL(4, opt); \
- DECL(8, opt); \
- DECL(16, opt)
-
-#if HAVE_SSSE3
-DECLS(ssse3);
-#endif
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
- &sse2, NULL, NULL); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
- }
-
-#if !CONFIG_REALTIME_ONLY
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
- FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
- FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
- FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
- FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
- FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
- FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
- FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-#else
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
- FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
- FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
- FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
- FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
- FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
- FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
-
-#if HAVE_SSSE3
-FNS(ssse3)
-#endif
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
- int aom_sub_pixel_avg_variance##w##xh_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
- ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
- void *unused)
-#define DECLS(opt) \
- DECL(4, opt); \
- DECL(8, opt); \
- DECL(16, opt)
-
-#if HAVE_SSSE3
-DECLS(ssse3);
-#endif
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
- const uint8_t *sec) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- const uint8_t *sec_ptr = sec; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
- sec_ptr, w, hf, &sse2, NULL, NULL); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- sec_ptr += hf * w; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- sec += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
- }
-
-#if !CONFIG_REALTIME_ONLY
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
- FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
- FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
- FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
- FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
- FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
- FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
- FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-#else
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
- FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
- FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
- FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
- FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
- FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
- FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
-
-#if HAVE_SSSE3
-FNS(ssse3)
-#endif
-
-#undef FNS
-#undef FN
-
static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
const __m128i s1,
const __m128i a) {
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c
new file mode 100644
index 0000000..d616f43
--- /dev/null
+++ b/aom_dsp/x86/variance_ssse3.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+FNS(ssse3)
+
+#undef FNS
+#undef FN