Opt blend_a64_mask

1. optimize aom_lowbd_blend_a64_d16_mask_sse4_1
2. add aom_lowbd_blend_a64_d16_mask_avx2

Speed up about 0.5% without rd change

test sequence: BasketballDrill_832x480_50.y4m

test command line:./aomenc --cpu-used=1 --psnr -D \
 -q --end-usage=vbr --target-bitrate=800 --limit=20 \
 BasketballDrill_832x480_50.y4m -otest.webm

Change-Id: I3f909105fcdfe7921eff2d7c47ecdc09a2179253
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 7c0111a..2d9e775 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -76,6 +76,7 @@
             "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+            "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
@@ -88,7 +89,8 @@
             "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_NEON
             "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"