SSE4.1 and AVX2 highbd_blend_a64_d16_mask
Add SSE4.1 and AVX2 SIMD optimised implementations of
highbd_blend_a64_d16_mask.
Unit test speed-up is approximately 2x to 2.2x for SSE4.1 and 2.8x to
3.6x for AVX2.
On 15 frames of crowd_run_360 at 10/12bd cpu=0 speed-up is about 3-4%.
At cpu=1 speed-up is about 3%.
I also corrected a comment at the top of blend_a64_mask.c which had not
been updated to reflect the rename of d32 blend functions to d16.
Change-Id: I789932a792c78e1bc64eabbf7143ecb0e67532da
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 66ca6fc..7592533 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -86,6 +86,7 @@
w_ = block_size_wide[block_size];
h_ = block_size_high[block_size];
run_times = run_times > 1 ? run_times / w_ : 1;
+ ASSERT_GT(run_times, 0);
subx_ = subx;
suby_ = suby;
@@ -248,13 +249,13 @@
INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
::testing::Values(TestFuncs(
aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
-#endif // HAVE_AVX2
+#endif // HAVE_SSE4_1
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
aom_blend_a64_mask_avx2)));
-#endif // HAVE_SSE4_1
+#endif // HAVE_AVX2
//////////////////////////////////////////////////////////////////////////////
// 8 bit _d16 version
@@ -482,6 +483,7 @@
static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
ConvolveParams conv_params;
conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
conv_params.round_1 = COMPOUND_ROUND1_BITS;
@@ -566,11 +568,45 @@
}
}
}
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {  // Timing run; the DISABLED_ prefix makes gtest skip it unless disabled tests are requested.
+ const int kRunTimes = 10000000;  // run-count argument handed to every RunOneTest call below
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {  // every block-size index below BLOCK_SIZES_ALL
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {  // bit depths 8, 10 and 12
+ for (int i = 0; i < kBufSize; ++i) {  // refill the buffers for this bit depth
+ dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_);  // reduced to [0, 2^bit_depth_)
+ dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_);  // drawn separately from dst_ref_
+
+ src0_[i] = rng_.Rand16();  // sources are filled from Rand16()
+ src1_[i] = rng_.Rand16();
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);  // presumably in [0, AOM_BLEND_A64_MAX_ALPHA] -- confirm against the RNG helper
+
+ RunOneTest(bsize, 1, 1, kRunTimes);  // NOTE(review): 2nd/3rd args look like subx/suby flags -- confirm against RunOneTest
+ RunOneTest(bsize, 0, 0, kRunTimes);  // same block size with both of those arguments set to 0
+ }
+ }
+}
INSTANTIATE_TEST_CASE_P(
C, BlendA64MaskTestHBD_d16,
::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
+#if HAVE_SSE4_1  // registered only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64MaskTestHBD_d16,  // instantiation name, then the fixture it applies to
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,  // C function paired with the SSE4.1 one
+ aom_highbd_blend_a64_d16_mask_sse4_1)));  // NOTE(review): presumably (reference, under-test) order -- confirm
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2  // registered only when HAVE_AVX2 is non-zero
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlendA64MaskTestHBD_d16,  // instantiation name, then the fixture it applies to
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,  // C function paired with the AVX2 one
+ aom_highbd_blend_a64_d16_mask_avx2)));  // NOTE(review): presumably (reference, under-test) order -- confirm
+#endif // HAVE_AVX2
+
// TODO(slavarnway): Enable the following in the avx2 commit. (56501)
#if 0
#if HAVE_AVX2