SSE4.1 and AVX2 highbd_blend_a64_d16_mask

Add SSE4.1 and AVX2 SIMD optimised implementations of
highbd_blend_a64_d16_mask.

Unit test speed-ups are approximately 2x to 2.2x for SSE4.1 and 2.8x to
3.6x for AVX2.

On 15 frames of crowd_run_360 at 10/12-bit depth, the cpu=0 speed-up is
about 3-4%. At cpu=1 the speed-up is about 3%.

I also corrected a comment at the top of blend_a64_mask.c which had not
been updated to reflect the rename of d32 blend functions to d16.

Change-Id: I789932a792c78e1bc64eabbf7143ecb0e67532da
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 66ca6fc..7592533 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -86,6 +86,7 @@
     w_ = block_size_wide[block_size];
     h_ = block_size_high[block_size];
     run_times = run_times > 1 ? run_times / w_ : 1;
+    ASSERT_GT(run_times, 0);
     subx_ = subx;
     suby_ = suby;
 
@@ -248,13 +249,13 @@
 INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
                         ::testing::Values(TestFuncs(
                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
-#endif  // HAVE_AVX2
+#endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
                         ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
                                                     aom_blend_a64_mask_avx2)));
-#endif  // HAVE_SSE4_1
+#endif  // HAVE_AVX2
 
 //////////////////////////////////////////////////////////////////////////////
 // 8 bit _d16 version
@@ -482,6 +483,7 @@
   static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
 
   void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+    ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
     ConvolveParams conv_params;
     conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
     conv_params.round_1 = COMPOUND_ROUND1_BITS;
@@ -566,11 +568,45 @@
     }
   }
 }
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_);
+        dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_);
+
+        src0_[i] = rng_.Rand16();
+        src1_[i] = rng_.Rand16();
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      RunOneTest(bsize, 1, 1, kRunTimes);
+      RunOneTest(bsize, 0, 0, kRunTimes);
+    }
+  }
+}
 
 INSTANTIATE_TEST_CASE_P(
     C, BlendA64MaskTestHBD_d16,
     ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+                                       aom_highbd_blend_a64_d16_mask_avx2)));
+#endif  // HAVE_AVX2
+
 // TODO(slavarnway): Enable the following in the avx2 commit. (56501)
 #if 0
 #if HAVE_AVX2