aom_dsp: remove x86inc.asm distinction

Manually cherry-picked from libvpx/master:
1b833d63d9c82270e4ea588541d14e9111c64c79
a4f3751be5f012d66011ddc1c5f12bd12734a1d3

Change-Id: I3b82e54a3173ac1458a13f33fd36094fec066f1c
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index c04d955..896b729 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -65,18 +65,14 @@
 DSP_SRCS-yes += daalaboolwriter.h
 endif
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
-endif  # CONFIG_USE_X86INC
 endif  # CONFIG_AOM_HIGHBITDEPTH
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
@@ -116,9 +112,7 @@
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
 endif
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
-endif
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
@@ -209,10 +203,8 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
-endif
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
 DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
@@ -227,12 +219,10 @@
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
 endif  # ARCH_X86_64
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
@@ -284,11 +274,9 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 endif
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
-endif
 
 # avg
 DSP_SRCS-yes           += avg.c
@@ -296,10 +284,8 @@
 DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
 DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
 ifeq ($(ARCH_X86_64),yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
-endif
 
 endif  # CONFIG_AV1_ENCODER
 
@@ -321,7 +307,6 @@
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
@@ -332,7 +317,6 @@
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
 endif  # CONFIG_AOM_HIGHBITDEPTH
-endif  # CONFIG_USE_X86INC
 
 endif  # CONFIG_ENCODERS
 
@@ -363,17 +347,13 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
 endif  # ARCH_X86_64
 
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
-endif  # CONFIG_USE_X86INC
 
 ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
-ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
-endif  # CONFIG_USE_X86INC
 endif  # CONFIG_AOM_HIGHBITDEPTH
 
 ifeq ($(CONFIG_MOTION_VAR),yes)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 027c9fa..fc2535d 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -11,29 +11,6 @@
 }
 forward_decls qw/aom_dsp_forward_decls/;
 
-# x86inc.asm had specific constraints. break it out so it's easy to disable.
-# zero all the variables to avoid tricky else conditions.
-$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
-  $avx2_x86inc = '';
-$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
-  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
-if (aom_config("CONFIG_USE_X86INC") eq "yes") {
-  $mmx_x86inc = 'mmx';
-  $sse_x86inc = 'sse';
-  $sse2_x86inc = 'sse2';
-  $ssse3_x86inc = 'ssse3';
-  $avx_x86inc = 'avx';
-  $avx2_x86inc = 'avx2';
-  if ($opts{arch} eq "x86_64") {
-    $mmx_x86_64_x86inc = 'mmx';
-    $sse_x86_64_x86inc = 'sse';
-    $sse2_x86_64_x86inc = 'sse2';
-    $ssse3_x86_64_x86inc = 'ssse3';
-    $avx_x86_64_x86inc = 'avx';
-    $avx2_x86_64_x86inc = 'avx2';
-  }
-}
-
 # optimizations which depend on multiple features
 $avx2_ssse3 = '';
 if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
@@ -126,7 +103,7 @@
 specialize qw/aom_d63f_predictor_4x4/;
 
 add_proto qw/void aom_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 
 add_proto qw/void aom_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_he_predictor_4x4/;
@@ -138,28 +115,28 @@
 specialize qw/aom_d135_predictor_4x4 neon/;
 
 add_proto qw/void aom_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_d153_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/aom_d153_predictor_4x4 ssse3/;
 
 add_proto qw/void aom_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_v_predictor_4x4 neon msa/, "$sse2_x86inc";
+specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 
 add_proto qw/void aom_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_ve_predictor_4x4/;
 
 add_proto qw/void aom_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/;
 
 add_proto qw/void aom_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
 
 add_proto qw/void aom_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void aom_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void aom_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void aom_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d207e_predictor_8x8/;
@@ -171,7 +148,7 @@
 specialize qw/aom_d63e_predictor_8x8/;
 
 add_proto qw/void aom_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_h_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void aom_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d117_predictor_8x8/;
@@ -180,25 +157,25 @@
 specialize qw/aom_d135_predictor_8x8/;
 
 add_proto qw/void aom_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_d153_predictor_8x8/, "$ssse3_x86inc";
+specialize qw/aom_d153_predictor_8x8 ssse3/;
 
 add_proto qw/void aom_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_v_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_v_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void aom_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void aom_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_predictor_8x8 dspr2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
 
 add_proto qw/void aom_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_top_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void aom_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_left_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void aom_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_128_predictor_8x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void aom_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d207e_predictor_16x16/;
@@ -210,7 +187,7 @@
 specialize qw/aom_d63e_predictor_16x16/;
 
 add_proto qw/void aom_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_h_predictor_16x16 neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 
 add_proto qw/void aom_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d117_predictor_16x16/;
@@ -219,25 +196,25 @@
 specialize qw/aom_d135_predictor_16x16/;
 
 add_proto qw/void aom_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_d153_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/aom_d153_predictor_16x16 ssse3/;
 
 add_proto qw/void aom_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_v_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_v_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void aom_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_tm_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void aom_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
 
 add_proto qw/void aom_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void aom_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void aom_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
 
 add_proto qw/void aom_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d207e_predictor_32x32/;
@@ -249,7 +226,7 @@
 specialize qw/aom_d63e_predictor_32x32/;
 
 add_proto qw/void aom_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_h_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/aom_h_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void aom_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d117_predictor_32x32/;
@@ -258,25 +235,25 @@
 specialize qw/aom_d135_predictor_32x32/;
 
 add_proto qw/void aom_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_d153_predictor_32x32/, "$ssse3_x86inc";
+specialize qw/aom_d153_predictor_32x32 ssse3/;
 
 add_proto qw/void aom_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_v_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/aom_v_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void aom_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
+specialize qw/aom_tm_predictor_32x32 neon msa sse2/;
 
 add_proto qw/void aom_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void aom_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void aom_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/;
 
 add_proto qw/void aom_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
 
 # High bitdepth functions
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
@@ -302,13 +279,13 @@
   specialize qw/aom_highbd_d153_predictor_4x4/;
 
   add_proto qw/void aom_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_v_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_v_predictor_4x4 sse2/;
 
   add_proto qw/void aom_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_tm_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_tm_predictor_4x4 sse2/;
 
   add_proto qw/void aom_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_dc_predictor_4x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
 
   add_proto qw/void aom_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/aom_highbd_dc_top_predictor_4x4/;
@@ -341,13 +318,13 @@
   specialize qw/aom_highbd_d153_predictor_8x8/;
 
   add_proto qw/void aom_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_v_predictor_8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_v_predictor_8x8 sse2/;
 
   add_proto qw/void aom_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_tm_predictor_8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_tm_predictor_8x8 sse2/;
 
   add_proto qw/void aom_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_dc_predictor_8x8/, "$sse2_x86inc";;
+  specialize qw/aom_highbd_dc_predictor_8x8 sse2/;;
 
   add_proto qw/void aom_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/aom_highbd_dc_top_predictor_8x8/;
@@ -380,13 +357,13 @@
   specialize qw/aom_highbd_d153_predictor_16x16/;
 
   add_proto qw/void aom_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_v_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_v_predictor_16x16 sse2/;
 
   add_proto qw/void aom_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_tm_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_tm_predictor_16x16 sse2/;
 
   add_proto qw/void aom_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_dc_predictor_16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
 
   add_proto qw/void aom_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/aom_highbd_dc_top_predictor_16x16/;
@@ -419,13 +396,13 @@
   specialize qw/aom_highbd_d153_predictor_32x32/;
 
   add_proto qw/void aom_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_v_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_v_predictor_32x32 sse2/;
 
   add_proto qw/void aom_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_tm_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_tm_predictor_32x32 sse2/;
 
   add_proto qw/void aom_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/aom_highbd_dc_predictor_32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
 
   add_proto qw/void aom_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/aom_highbd_dc_top_predictor_32x32/;
@@ -441,10 +418,10 @@
 # Sub Pixel Filters
 #
 add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_convolve_copy neon dspr2 msa sse2/;
 
 add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
+specialize qw/aom_convolve_avg neon dspr2 msa sse2/;
 
 add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 specialize qw/aom_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
@@ -487,10 +464,10 @@
   # Sub Pixel Filters
   #
   add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_copy/, "$sse2_x86inc";
+  specialize qw/aom_highbd_convolve_copy sse2/;
 
   add_proto qw/void aom_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_convolve_avg sse2/;
 
   add_proto qw/void aom_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/aom_highbd_convolve8/, "$sse2_x86_64";
@@ -671,7 +648,7 @@
   specialize qw/aom_fdct4x4_1 sse2/;
 
   add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
+  specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
 
   add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/aom_fdct8x8_1 sse2 neon msa/;
@@ -703,7 +680,7 @@
   specialize qw/aom_iwht4x4_1_add/;
 
   add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_iwht4x4_16_add/, "$sse2_x86inc";
+  specialize qw/aom_iwht4x4_16_add sse2/;
 
   add_proto qw/void aom_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
   specialize qw/aom_highbd_idct4x4_1_add/;
@@ -789,10 +766,10 @@
     specialize qw/aom_idct4x4_1_add sse2/;
 
     add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct8x8_64_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct8x8_12_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/aom_idct8x8_1_add sse2/;
@@ -807,15 +784,15 @@
     specialize qw/aom_idct16x16_1_add sse2/;
 
     add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_1024_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_135_add sse2/, "$ssse3_x86_64";
     # Need to add 135 eob idct32x32 implementations.
     $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
 
     add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_34_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/aom_idct32x32_1_add sse2/;
@@ -890,10 +867,10 @@
     specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
 
     add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/;
@@ -905,10 +882,10 @@
     specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
 
     add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
     # Need to add 135 eob idct32x32 implementations.
     $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
     $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
@@ -916,7 +893,7 @@
     $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
 
     add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
+    specialize qw/aom_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64";
     # Need to add 34 eob idct32x32 neon implementation.
     $aom_idct32x32_34_add_neon_asm=aom_idct32x32_1024_add_neon;
 
@@ -927,7 +904,7 @@
     specialize qw/aom_iwht4x4_1_add msa/;
 
     add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_iwht4x4_16_add msa/, "$sse2_x86inc";
+    specialize qw/aom_iwht4x4_16_add msa sse2/;
   }  # CONFIG_EMULATE_HARDWARE
 }  # CONFIG_AOM_HIGHBITDEPTH
 }  # CONFIG_AV1
@@ -950,10 +927,10 @@
 } else {
   if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+    specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
 
     add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+    specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
 
     if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
       add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
@@ -989,49 +966,49 @@
 # Block subtraction
 #
 add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/aom_subtract_block neon msa/, "$sse2_x86inc";
+specialize qw/aom_subtract_block neon msa sse2/;
 
 #
 # Single block SAD
 #
 add_proto qw/unsigned int aom_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad64x64 avx2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x64 avx2 neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad64x32 avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x32 avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x64 avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x64 avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x32 avx2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x32 avx2 neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x16 avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x16 avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x32 msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x32 msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x16 media neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x16 media neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x8 neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x16 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x16 neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x8 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x8 neon msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x4 msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x4 msa sse2/;
 
 add_proto qw/unsigned int aom_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad4x8 msa/, "$sse2_x86inc";
+specialize qw/aom_sad4x8 msa sse2/;
 
 add_proto qw/unsigned int aom_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad4x4 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad4x4 neon msa sse2/;
 
 #
 # OBMC SAD
@@ -1091,7 +1068,7 @@
   specialize qw/aom_minmax_8x8 sse2/;
 
   add_proto qw/void aom_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/aom_hadamard_8x8 sse2/, "$ssse3_x86_64";
 
   add_proto qw/void aom_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
   specialize qw/aom_hadamard_16x16 sse2/;
@@ -1110,43 +1087,43 @@
 }  # CONFIG_AV1_ENCODER
 
 add_proto qw/unsigned int aom_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad64x64_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x64_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad64x32_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x32_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x64_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x64_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x32_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x32_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x16_avg avx2 msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x32_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x32_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x16_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x16_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x8_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x8_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x16_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x16_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x8_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x8_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x4_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x4_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad4x8_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad4x8_avg msa sse2/;
 
 add_proto qw/unsigned int aom_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad4x4_avg msa/, "$sse2_x86inc";
+specialize qw/aom_sad4x4_avg msa sse2/;
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
@@ -1205,43 +1182,43 @@
 # Multi-block SAD, comparing a reference to N independent blocks
 #
 add_proto qw/void aom_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
 
 add_proto qw/void aom_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x32x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad64x32x4d msa sse2/;
 
 add_proto qw/void aom_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x64x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x64x4d msa sse2/;
 
 add_proto qw/void aom_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
 
 add_proto qw/void aom_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x16x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad32x16x4d msa sse2/;
 
 add_proto qw/void aom_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x32x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x32x4d msa sse2/;
 
 add_proto qw/void aom_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x16x4d neon msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x16x4d neon msa sse2/;
 
 add_proto qw/void aom_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x8x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad16x8x4d msa sse2/;
 
 add_proto qw/void aom_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x16x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x16x4d msa sse2/;
 
 add_proto qw/void aom_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x8x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x8x4d msa sse2/;
 
 add_proto qw/void aom_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x4x4d msa/, "$sse2_x86inc";
+specialize qw/aom_sad8x4x4d msa sse2/;
 
 add_proto qw/void aom_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x8x4d msa/, "$sse_x86inc";
+specialize qw/aom_sad4x8x4d msa sse2/;
 
 add_proto qw/void aom_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x4x4d msa/, "$sse_x86inc";
+specialize qw/aom_sad4x4x4d msa sse2/;
 
 #
 # Structured Similarity (SSIM)
@@ -1265,37 +1242,37 @@
   # Single block SAD
   #
   add_proto qw/unsigned int aom_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x64 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x32 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x64 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x32 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x16 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x32 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x16 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x8 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x16 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x8 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/aom_highbd_sad8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x4 sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
   specialize qw/aom_highbd_sad4x8/;
@@ -1314,37 +1291,37 @@
   specialize qw/aom_highbd_minmax_8x8/;
 
   add_proto qw/unsigned int aom_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad64x64_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x64_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad64x32_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x32_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad32x64_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x64_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad32x32_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x32_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad32x16_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x16_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad16x32_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x32_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad16x16_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x16_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad16x8_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x8_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad8x16_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x16_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad8x8_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x8_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/aom_highbd_sad8x4_avg/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x4_avg sse2/;
 
   add_proto qw/unsigned int aom_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/aom_highbd_sad4x8_avg/;
@@ -1409,43 +1386,43 @@
   # Multi-block SAD, comparing a reference to N independent blocks
   #
   add_proto qw/void aom_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad64x64x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x64x4d sse2/;
 
   add_proto qw/void aom_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad64x32x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad64x32x4d sse2/;
 
   add_proto qw/void aom_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad32x64x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x64x4d sse2/;
 
   add_proto qw/void aom_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad32x32x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x32x4d sse2/;
 
   add_proto qw/void aom_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad32x16x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad32x16x4d sse2/;
 
   add_proto qw/void aom_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad16x32x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x32x4d sse2/;
 
   add_proto qw/void aom_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad16x16x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x16x4d sse2/;
 
   add_proto qw/void aom_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad16x8x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad16x8x4d sse2/;
 
   add_proto qw/void aom_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad8x16x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x16x4d sse2/;
 
   add_proto qw/void aom_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad8x8x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x8x4d sse2/;
 
   add_proto qw/void aom_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad8x4x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad8x4x4d sse2/;
 
   add_proto qw/void aom_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad4x8x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad4x8x4d sse2/;
 
   add_proto qw/void aom_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_highbd_sad4x4x4d/, "$sse2_x86inc";
+  specialize qw/aom_highbd_sad4x4x4d sse2/;
 
   #
   # Structured Similarity (SSIM)
@@ -1546,82 +1523,82 @@
 # Subpixel Variance
 #
 add_proto qw/uint32_t aom_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
 
 add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
 
 #
 # Specialty Subpixel
@@ -1777,217 +1754,217 @@
   # Subpixel Variance
   #
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+  specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index d2ebc34..38a6a1e 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -250,7 +250,6 @@
   return *sse;
 }
 
-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in
 // highbd_subpel_variance_impl_sse2.asm
@@ -557,7 +556,6 @@
 
 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC
 
 void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                     const uint8_t *ref8, const int ref_stride) {
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index d48f3fd..f27015f 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -23,33 +23,41 @@
   movd                  m4, [ref2q+%3]
   movd                  m7, [ref3q+%3]
   movd                  m5, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m6, [ref1q+%5]
-  punpckldq             m4, [ref2q+%5]
-  punpckldq             m7, [ref3q+%5]
-  punpckldq             m5, [ref4q+%5]
+  movd                  m1, [srcq +%4]
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1
+  punpckldq             m6, m2
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0
+  movlhps               m6, m4
+  movlhps               m7, m5
   psadbw                m6, m0
-  psadbw                m4, m0
   psadbw                m7, m0
-  psadbw                m5, m0
-  punpckldq             m6, m4
-  punpckldq             m7, m5
 %else
   movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
   movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
   movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
   movd                  m4, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m1, [ref1q+%5]
-  punpckldq             m2, [ref2q+%5]
-  punpckldq             m3, [ref3q+%5]
-  punpckldq             m4, [ref4q+%5]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq +%4]
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2
+  movlhps               m3, m4
   psadbw                m1, m0
-  psadbw                m2, m0
   psadbw                m3, m0
-  psadbw                m4, m0
-  punpckldq             m1, m2
-  punpckldq             m3, m4
   paddd                 m6, m1
   paddd                 m7, m3
 %endif
@@ -170,10 +178,16 @@
   PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
 %endmacro
 
+; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_128x2x4 5-6 0
+  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
+  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6
+%endmacro
+
 ; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
 %macro SADNXN4D 2
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
@@ -195,7 +209,7 @@
 %endrep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
 
-%if mmsize == 16
+%if %1 > 4
   pslldq                m5, 4
   pslldq                m7, 4
   por                   m4, m5
@@ -210,8 +224,10 @@
   RET
 %else
   movifnidn             r4, r4mp
-  movq               [r4+0], m6
-  movq               [r4+8], m7
+  pshufd            m6, m6, 0x08
+  pshufd            m7, m7, 0x08
+  movq              [r4+0], m6
+  movq              [r4+8], m7
   RET
 %endif
 %endmacro
@@ -228,7 +244,5 @@
 SADNXN4D  8, 16
 SADNXN4D  8,  8
 SADNXN4D  8,  4
-
-INIT_MMX sse
 SADNXN4D  4,  8
 SADNXN4D  4,  4
diff --git a/aom_dsp/x86/subpel_variance_sse2.asm b/aom_dsp/x86/subpel_variance_sse2.asm
index 0bfc63a..d3feb7e 100644
--- a/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/aom_dsp/x86/subpel_variance_sse2.asm
@@ -60,8 +60,8 @@
   paddd                %6, %1
 %endmacro
 
-%macro STORE_AND_RET 0
-%if mmsize == 16
+%macro STORE_AND_RET 1
+%if %1 > 4
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -81,16 +81,16 @@
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
-%else ; mmsize == 8
-  pshufw               m4, m6, 0xe
-  pshufw               m3, m7, 0xe
+%else ; 4xh
+  pshuflw              m4, m6, 0xe
+  pshuflw              m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
-  pshufw               m4, m6, 0xe
+  pshuflw              m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
@@ -199,6 +199,12 @@
   %endif
 %endif
 
+%if %1 == 4
+  %define movx movd
+%else
+  %define movx movh
+%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -231,6 +237,7 @@
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+
 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
@@ -240,24 +247,37 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
+  movx                 m0, [srcq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
 %endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
+  movx                 m2, [srcq+src_strideq]
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+
 %if %2 == 1 ; avg
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -274,10 +294,10 @@
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_zero_y_nonhalf
 
   ; x_offset == 0 && y_offset == 0.5
@@ -299,37 +319,41 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq*2]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
-%else
-  punpckldq            m2, [srcq+src_strideq*2]
 %endif
-%endif
-  movh                 m1, [dstq]
-%if mmsize == 16
+  movx                 m1, [dstq]
+%if %1 > 4
   movlhps              m0, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
 %endif
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m4, [secq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m1, [dstq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -346,7 +370,7 @@
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -354,7 +378,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -427,12 +451,12 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -452,17 +476,27 @@
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -478,10 +512,10 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonzero:
-  cmp           x_offsetd, 8
+  cmp           x_offsetd, 4
   jne .x_nonhalf
   ; x_offset == 0.5
   test          y_offsetd, y_offsetd
@@ -506,30 +540,40 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m4, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m4, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
-  punpckldq            m4, [srcq+src_strideq+1]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
+  movx                 m2, [srcq+src_strideq+1]
+  punpckldq            m4, m2
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
-  movh                 m1, [dstq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m1, [dstq]
   pavgb                m0, m4
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -546,10 +590,10 @@
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_half_y_nonhalf
 
   ; x_offset == 0.5 && y_offset == 0.5
@@ -581,53 +625,58 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m3, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq]
   punpckldq            m2, m1
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m1, [srcq+src_strideq+1]
   punpckldq            m3, m1
-%else
-  punpckldq            m2, [srcq+src_strideq]
-  punpckldq            m3, [srcq+src_strideq+1]
-%endif
 %endif
   pavgb                m2, m3
-%if mmsize == 16
+%if %1 > 4
   movlhps              m0, m2
   movhlps              m4, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
-  pshufw               m4, m2, 0xe
+  pshuflw              m4, m2, 0xe
 %endif
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq]
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
@@ -644,7 +693,7 @@
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -652,7 +701,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -727,23 +776,23 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 %if notcpuflag(ssse3)
   punpcklbw            m0, m5
 %endif
 .x_half_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -763,16 +812,26 @@
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -789,7 +848,7 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf:
   test          y_offsetd, y_offsetd
@@ -800,7 +859,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -868,14 +927,14 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
@@ -895,17 +954,27 @@
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -921,10 +990,10 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
@@ -932,7 +1001,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1040,8 +1109,8 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1057,17 +1126,17 @@
   add                srcq, src_strideq
   psraw                m0, 4
 .x_other_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
@@ -1082,9 +1151,9 @@
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
@@ -1092,10 +1161,20 @@
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
@@ -1113,7 +1192,7 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
 %ifdef PIC
@@ -1121,7 +1200,7 @@
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1264,8 +1343,8 @@
   INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1286,20 +1365,20 @@
   INC_SRC_BY_SRC_STRIDE
 
 .x_other_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
 
   INC_SRC_BY_SRC_STRIDE
-  movh                 m4, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m4, [srcq]
+  movx                 m3, [srcq+1]
 
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m3, [dstq+dst_strideq]
-  movh                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
@@ -1338,9 +1417,9 @@
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
@@ -1348,10 +1427,20 @@
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
@@ -1369,7 +1458,8 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+%undef movx
+  STORE_AND_RET %1
 %endmacro
 
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4&&y=0,4) are identical
@@ -1378,26 +1468,22 @@
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4
 INIT_XMM sse2
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4, 1
 INIT_XMM sse2
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4, 1
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 949d585..75e9719 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -335,7 +335,6 @@
   return *sse;
 }
 
-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt)                                                           \
@@ -344,11 +343,11 @@
       const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
       void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt2);          \
+  DECL(4, opt1);          \
   DECL(8, opt1);          \
   DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
@@ -397,10 +396,10 @@
   FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
   FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
   FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt2, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt2, (int32_t), (int32_t))
+  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
+  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse);
+FNS(sse2, sse2);
 FNS(ssse3, ssse3);
 
 #undef FNS
@@ -414,11 +413,11 @@
       ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
       void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt2);          \
+  DECL(4, opt1);          \
   DECL(8, opt1);          \
   DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
@@ -468,15 +467,14 @@
   FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
   FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
   FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt2, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt2, (uint32_t), (int32_t))
+  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
+  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
 
-FNS(sse2, sse);
+FNS(sse2, sse2);
 FNS(ssse3, ssse3);
 
 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC
 
 void aom_upsampled_pred_sse2(uint8_t *pred, int width, int height,
                              const uint8_t *ref, const int ref_stride) {
@@ -683,7 +681,6 @@
         p0 = _mm_packus_epi16(p0, zero);
 
         *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-
         comp_pred += 4;
         pred += 4;
         ref += 4 * 8;
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 176643a..ddc2422 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -677,18 +677,8 @@
 
 //------------------------------------------------------------------------------
 // x86 functions
-#if HAVE_SSE
-#if CONFIG_USE_X86INC
-const SadMxNx4Param x4d_sse_tests[] = {
-  make_tuple(4, 8, &aom_sad4x8x4d_sse, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::ValuesIn(x4d_sse_tests));
-#endif  // CONFIG_USE_X86INC
-#endif  // HAVE_SSE
 
 #if HAVE_SSE2
-#if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
   make_tuple(64, 64, &aom_sad64x64_sse2, -1),
   make_tuple(64, 32, &aom_sad64x32_sse2, -1),
@@ -805,6 +795,8 @@
   make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1),
   make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1),
   make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
 #if CONFIG_AOM_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
@@ -848,7 +840,6 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
-#endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 844a1d0..081c85a 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -987,8 +987,8 @@
                       make_tuple(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
                       make_tuple(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
                       make_tuple(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &aom_sub_pixel_variance4x8_sse, 0),
-                      make_tuple(2, 2, &aom_sub_pixel_variance4x4_sse, 0)));
+                      make_tuple(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &aom_sub_pixel_variance4x4_sse2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxSubpelAvgVarianceTest,
@@ -1004,8 +1004,8 @@
         make_tuple(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
         make_tuple(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
         make_tuple(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &aom_sub_pixel_avg_variance4x8_sse, 0),
-        make_tuple(2, 2, &aom_sub_pixel_avg_variance4x4_sse, 0)));
+        make_tuple(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
+        make_tuple(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC
 
 #if CONFIG_AOM_HIGHBITDEPTH