SSE4.1 and AVX2 highbd_blend_a64_d16_mask

Add SSE4.1 and AVX2 SIMD optimised implementations of
highbd_blend_a64_d16_mask.

Unit test speed-ups are approximately 2x to 2.2x for SSE4.1 and 2.8x to
3.6x for AVX2.

On 15 frames of crowd_run_360 at 10/12-bit depth, the cpu=0 speed-up is
about 3-4%. At cpu=1 the speed-up is about 3%.

I also corrected a comment at the top of blend_a64_mask.c which had not
been updated to reflect the rename of d32 blend functions to d16.

Change-Id: I789932a792c78e1bc64eabbf7143ecb0e67532da
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 66ca6fc..7592533 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -86,6 +86,7 @@
     w_ = block_size_wide[block_size];
     h_ = block_size_high[block_size];
     run_times = run_times > 1 ? run_times / w_ : 1;
+    ASSERT_GT(run_times, 0);
     subx_ = subx;
     suby_ = suby;
 
@@ -248,13 +249,13 @@
 INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
                         ::testing::Values(TestFuncs(
                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
-#endif  // HAVE_AVX2
+#endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
                         ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
                                                     aom_blend_a64_mask_avx2)));
-#endif  // HAVE_SSE4_1
+#endif  // HAVE_AVX2
 
 //////////////////////////////////////////////////////////////////////////////
 // 8 bit _d16 version
@@ -482,6 +483,7 @@
   static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
 
   void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+    ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
     ConvolveParams conv_params;
     conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
     conv_params.round_1 = COMPOUND_ROUND1_BITS;
@@ -566,11 +568,45 @@
     }
   }
 }
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_);
+        dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_);
+
+        src0_[i] = rng_.Rand16();
+        src1_[i] = rng_.Rand16();
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      RunOneTest(bsize, 1, 1, kRunTimes);
+      RunOneTest(bsize, 0, 0, kRunTimes);
+    }
+  }
+}
 
 INSTANTIATE_TEST_CASE_P(
     C, BlendA64MaskTestHBD_d16,
     ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_avx2)));
+#endif  // HAVE_AVX2
+
 // TODO(slavarnway): Enable the following in the avx2 commit. (56501)
 #if 0
 #if HAVE_AVX2