Optimize blend_a64_mask

1. Optimize aom_lowbd_blend_a64_d16_mask_sse4_1
2. Add aom_lowbd_blend_a64_d16_mask_avx2

Encoder speed improves by about 0.5% with no RD (rate-distortion) change.

Test sequence: BasketballDrill_832x480_50.y4m

Test command line: ./aomenc --cpu-used=1 --psnr -D \
 -q --end-usage=vbr --target-bitrate=800 --limit=20 \
 BasketballDrill_832x480_50.y4m -otest.webm

Change-Id: I3f909105fcdfe7921eff2d7c47ecdc09a2179253
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index c9c6795..4d4f070 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -270,6 +270,13 @@
                                     aom_lowbd_blend_a64_d16_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, BlendA64MaskTest8B_d16,
+    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+                                    aom_lowbd_blend_a64_d16_mask_avx2)));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, BlendA64MaskTest8B_d16,