Move the SSSE3 code in variance_sse2.c to new file

Name the new file variance_ssse3.c. variance_sse2.c is now SSE2 only.

Bug: aomedia:3578
Change-Id: I9d75b6617dac011de7dcd0366fb9f8e5999ec133
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 6d8e5a9..750df42 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -267,6 +267,7 @@
               "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index e71244f..610695a 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -403,204 +403,6 @@
   return *sse;
 }
 
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt)                                                           \
-  int aom_sub_pixel_variance##w##xh_##opt(                                     \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
-      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
-      void *unused0, void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-#if HAVE_SSSE3
-DECLS(ssse3);
-#endif
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2, NULL, NULL);                                               \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
-  }
-
-#if !CONFIG_REALTIME_ONLY
-#define FNS(opt)                                    \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-#else
-#define FNS(opt)                                    \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
-
-#if HAVE_SSSE3
-FNS(ssse3)
-#endif
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                        \
-  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
-      void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-#if HAVE_SSSE3
-DECLS(ssse3);
-#endif
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
-      const uint8_t *sec) {                                                  \
-    /*Avoid overflow in helper by capping height.*/                          \
-    const int hf = AOMMIN(h, 64);                                            \
-    unsigned int sse = 0;                                                    \
-    int se = 0;                                                              \
-    for (int i = 0; i < (w / wf); ++i) {                                     \
-      const uint8_t *src_ptr = src;                                          \
-      const uint8_t *dst_ptr = dst;                                          \
-      const uint8_t *sec_ptr = sec;                                          \
-      for (int j = 0; j < (h / hf); ++j) {                                   \
-        unsigned int sse2;                                                   \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
-            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
-        dst_ptr += hf * dst_stride;                                          \
-        src_ptr += hf * src_stride;                                          \
-        sec_ptr += hf * w;                                                   \
-        se += se2;                                                           \
-        sse += sse2;                                                         \
-      }                                                                      \
-      src += wf;                                                             \
-      dst += wf;                                                             \
-      sec += wf;                                                             \
-    }                                                                        \
-    *sse_ptr = sse;                                                          \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
-  }
-
-#if !CONFIG_REALTIME_ONLY
-#define FNS(opt)                                    \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-#else
-#define FNS(opt)                                    \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
-
-#if HAVE_SSSE3
-FNS(ssse3)
-#endif
-
-#undef FNS
-#undef FN
-
 static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
                                                       const __m128i s1,
                                                       const __m128i a) {
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c
new file mode 100644
index 0000000..d616f43
--- /dev/null
+++ b/aom_dsp/x86/variance_ssse3.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in subpel_variance.asm
+#define DECL(w, opt)                                                           \
+  int aom_sub_pixel_variance##w##xh_##opt(                                     \
+      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
+      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+      void *unused0, void *unused)
+#define DECLS(opt) \
+  DECL(4, opt);    \
+  DECL(8, opt);    \
+  DECL(16, opt)
+
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
+  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2, NULL, NULL);                                               \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
+  }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are place holders for PIC enabled build.
+#define DECL(w, opt)                                                        \
+  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
+      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
+      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
+      void *unused)
+#define DECLS(opt) \
+  DECL(4, opt);    \
+  DECL(8, opt);    \
+  DECL(16, opt)
+
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
+      const uint8_t *sec) {                                                  \
+    /*Avoid overflow in helper by capping height.*/                          \
+    const int hf = AOMMIN(h, 64);                                            \
+    unsigned int sse = 0;                                                    \
+    int se = 0;                                                              \
+    for (int i = 0; i < (w / wf); ++i) {                                     \
+      const uint8_t *src_ptr = src;                                          \
+      const uint8_t *dst_ptr = dst;                                          \
+      const uint8_t *sec_ptr = sec;                                          \
+      for (int j = 0; j < (h / hf); ++j) {                                   \
+        unsigned int sse2;                                                   \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
+            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
+        dst_ptr += hf * dst_stride;                                          \
+        src_ptr += hf * src_stride;                                          \
+        sec_ptr += hf * w;                                                   \
+        se += se2;                                                           \
+        sse += sse2;                                                         \
+      }                                                                      \
+      src += wf;                                                             \
+      dst += wf;                                                             \
+      sec += wf;                                                             \
+    }                                                                        \
+    *sse_ptr = sse;                                                          \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+  }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+FNS(ssse3)
+
+#undef FNS
+#undef FN