Add aom_comp_mask_pred_avx2

1. Add an AVX2 implementation of aom_comp_mask_pred.
2. For width 8, still use the SSSE3 version.
3. For other widths (16, 32), the AVX2 version is 1.2x-2.0x faster
   than the SSSE3 version.

Change-Id: I80acc1be54ab21a52f7847e91b1299853add757c
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 699f8b4..f61af74 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -392,6 +392,7 @@
       set(AOM_DSP_ENCODER_INTRIN_SSSE3
           ${AOM_DSP_ENCODER_INTRIN_SSSE3}
           "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+          "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
           "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
 
       set(AOM_DSP_ENCODER_INTRIN_SSE2