[CFL] SSSE3 Version of 4:4:4 HBD Subsampling

Includes unit tests for conformance and speed.

SSSE3/CFLSubsampleHBD444SpeedTest
4x4: C time = 155 us, SIMD time = 49 us (~3.2x)
8x8: C time = 522 us, SIMD time = 80 us (~6.5x)
16x16: C time = 2067 us, SIMD time = 286 us (~7.2x)
32x32: C time = 7045 us, SIMD time = 1044 us (~6.7x)

Change-Id: I0979ae2284765954b45fe9bb16ee618db1c4b36e
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 554598d..60d8ead 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -361,6 +361,9 @@ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size"; specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/; +add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size"; +specialize qw/cfl_get_luma_subsampling_444_hbd ssse3/; + add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size"; specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c index 250375f..0fd15d1 100644 --- a/av1/common/x86/cfl_ssse3.c +++ b/av1/common/x86/cfl_ssse3.c
@@ -266,6 +266,48 @@
   } while (pred_buf_m128i < end);
 }
 
+// Copies high-bit-depth luma samples from input into pred_buf_q3, scaling
+// every sample by 8 (left shift by 3). No averaging across samples is done.
+//
+// Width 4 moves 64 bits per row. Any other width moves one 128-bit register
+// (8 samples), two registers when width >= 16 and four when width == 32.
+// Source rows are input_stride samples apart, destination rows CFL_BUF_LINE
+// samples apart. The do/while form always processes at least one row.
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      int16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+
+  do {
+    if (width == 4) {
+      const __m128i px = _mm_loadl_epi64((const __m128i *)input);
+      _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_slli_epi16(px, 3));
+    } else if (width == 32) {
+      // Every load of the row is issued before its first store.
+      const __m128i px_0 = _mm_loadu_si128((const __m128i *)input);
+      const __m128i px_1 = _mm_loadu_si128((const __m128i *)(input + 8));
+      const __m128i px_2 = _mm_loadu_si128((const __m128i *)(input + 16));
+      const __m128i px_3 = _mm_loadu_si128((const __m128i *)(input + 24));
+      _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_slli_epi16(px_0, 3));
+      _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), _mm_slli_epi16(px_1, 3));
+      _mm_storeu_si128((__m128i *)(pred_buf_q3 + 16), _mm_slli_epi16(px_2, 3));
+      _mm_storeu_si128((__m128i *)(pred_buf_q3 + 24), _mm_slli_epi16(px_3, 3));
+    } else if (width >= 16) {
+      const __m128i px_0 = _mm_loadu_si128((const __m128i *)input);
+      const __m128i px_1 = _mm_loadu_si128((const __m128i *)(input + 8));
+      _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_slli_epi16(px_0, 3));
+      _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), _mm_slli_epi16(px_1, 3));
+    } else {
+      const __m128i px_0 = _mm_loadu_si128((const __m128i *)input);
+      _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_slli_epi16(px_0, 3));
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < end);
+}
+
+CFL_SUBSAMPLE_FUNCTIONS(ssse3, 444, hbd)
 CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
 
 static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc index 33bced7..d00ae76 100644 --- a/test/cfl_test.cc +++ b/test/cfl_test.cc
@@ -315,8 +315,7 @@ CFLSubsampleTest::SetUp(); fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size); fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size); - // TODO(ltrudeau) Replace with 444 when SIMD is available - fun_444_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size); + fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size); } }; @@ -337,6 +336,14 @@ subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12); } +TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) { + subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) { + subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12); +} + typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size); typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param; class CFLPredictTest : public ::testing::TestWithParam<predict_param>, @@ -448,11 +455,9 @@ }; const subsample_hbd_param subsample_hbd_sizes_ssse3[] = { - ALL_CFL_TX_SIZES_SUBSAMPLE( - cfl_get_luma_subsampling_420_hbd_ssse3, - cfl_get_luma_subsampling_422_hbd_ssse3, - cfl_get_luma_subsampling_420_hbd_ssse3) // TODO(ltrudeau) replace with - // 444 when SIMD is available + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3, + cfl_get_luma_subsampling_422_hbd_ssse3, + cfl_get_luma_subsampling_444_hbd_ssse3) }; const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES( @@ -488,8 +493,8 @@ ALL_CFL_TX_SIZES_SUBSAMPLE( cfl_get_luma_subsampling_420_hbd_avx2, cfl_get_luma_subsampling_422_hbd_avx2, - cfl_get_luma_subsampling_420_hbd_avx2) // TODO(ltrudeau) replace with - // 444 when SIMD is available + cfl_get_luma_subsampling_444_hbd_c) // TODO(ltrudeau) replace with + // 444 when SIMD is available }; const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES( @@ -529,8 +534,8 @@ ALL_CFL_TX_SIZES_SUBSAMPLE( cfl_get_luma_subsampling_420_hbd_neon, cfl_get_luma_subsampling_422_hbd_neon, - cfl_get_luma_subsampling_420_hbd_neon) // 
TODO(ltrudeau) replace with - // 444 when SIMD is available + cfl_get_luma_subsampling_444_hbd_c) // TODO(ltrudeau) replace with + // 444 when SIMD is available }; const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(