[CFL] SSSE3/AVX2 versions of cfl_build_prediction_lbd
Includes unit tests for conformance and speed.
SSSE3/CFLPredictTest:
4x4: C time = 2063 us, SIMD time = 313 us (~6.6x)
8x8: C time = 6656 us, SIMD time = 493 us (~14x)
16x16: C time = 24970 us, SIMD time = 1327 us (~19x)
32x32: C time = 59020 us, SIMD time = 5178 us (~11x)
AVX2/CFLPredictTest:
4x4: C time = 2052 us, SIMD time = 333 us (~6.2x)
8x8: C time = 6712 us, SIMD time = 513 us (~13x)
16x16: C time = 25292 us, SIMD time = 1023 us (~25x)
32x32: C time = 58994 us, SIMD time = 2828 us (~21x)
Change-Id: I08690a548be981ff10e184de468b9e0e691ee812
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 40fbe72..7a04952 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -35,16 +35,27 @@
make_tuple(16, 8, &function), make_tuple(8, 16, &function), \
make_tuple(16, 16, &function)
+// Expands to a comma-separated list of (TX_SIZE, &function) tuples, one for
+// each transform size listed below, from TX_4X4 up to TX_32X32.
+#define ALL_CFL_TX_SIZES(function) \
+  make_tuple(TX_4X4, &function),   \
+  make_tuple(TX_4X8, &function),   \
+  make_tuple(TX_8X4, &function),   \
+  make_tuple(TX_8X8, &function),   \
+  make_tuple(TX_8X16, &function),  \
+  make_tuple(TX_16X8, &function),  \
+  make_tuple(TX_16X16, &function), \
+  make_tuple(TX_16X32, &function), \
+  make_tuple(TX_32X16, &function), \
+  make_tuple(TX_32X32, &function)
+
namespace {
typedef void (*subtract_fn)(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3);
typedef cfl_subsample_lbd_fn (*get_subsample_fn)(int width, int height);
+// Getter that maps a transform size to a cfl_predict_lbd_fn; the predict tests
+// are parameterized on one of these.
+typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
+
typedef std::tr1::tuple<int, int, subtract_fn> subtract_param;
typedef std::tr1::tuple<int, int, get_subsample_fn> subsample_param;
+// Test parameter for CFLPredictTest: (transform size, predictor getter).
+typedef std::tr1::tuple<TX_SIZE, get_predict_fn> predict_param;
+
static void assertFaster(int ref_elapsed_time, int elapsed_time) {
EXPECT_GT(ref_elapsed_time, elapsed_time)
<< "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
@@ -109,6 +120,37 @@
}
};
+// Fixture for the CfL low-bit-depth prediction tests. It is parameterized on a
+// (TX_SIZE, predictor getter) pair and owns two sets of buffers: one for the
+// implementation under test and one ("_ref") for the C reference.
+class CFLPredictTest : public ::testing::TestWithParam<predict_param> {
+ public:
+  // The generator is seeded once per fixture. Seeding it inside init() would
+  // make every call replay the same sequence, so a loop over init() would
+  // test a single alpha/DC/luma combination over and over.
+  CFLPredictTest() : rnd(ACMRandom::DeterministicSeed()) {}
+  virtual ~CFLPredictTest() {}
+  // Picks up the predictor getter (second element of the test parameter).
+  virtual void SetUp() { predict = GET_PARAM(1); }
+
+ protected:
+  // Block dimensions looked up from the transform size under test.
+  int Width() const { return tx_size_wide[GET_PARAM(0)]; }
+  int Height() const { return tx_size_high[GET_PARAM(0)]; }
+  // Transform size under test (first element of the test parameter).
+  TX_SIZE Tx_size() const { return GET_PARAM(0); }
+  DECLARE_ALIGNED(32, uint8_t, chroma_pels_ref[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, sub_luma_pels_ref[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, chroma_pels[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, sub_luma_pels[CFL_BUF_SQUARE]);
+  get_predict_fn predict;
+  int alpha_q3;
+  uint8_t dc;
+  ACMRandom rnd;
+
+  // Draws a new alpha and DC value, then fills the top-left width x height
+  // region (row stride CFL_BUF_LINE) of the buffers: both chroma buffers get
+  // the DC value and both luma buffers get the same random sample per pixel.
+  void init(int width, int height) {
+    // NOTE(review): presumably rnd(33) is in [0, 32], giving alpha in
+    // [-16, 16] -- confirm against ACMRandom.
+    alpha_q3 = rnd(33) - 16;
+    dc = rnd.Rand8();
+    for (int j = 0; j < height; j++) {
+      for (int i = 0; i < width; i++) {
+        const int pos = j * CFL_BUF_LINE + i;
+        chroma_pels[pos] = dc;
+        chroma_pels_ref[pos] = dc;
+        // Same sample for both buffers so the two predictors see equal input.
+        const int16_t luma = rnd.Rand8() - 128;
+        sub_luma_pels[pos] = luma;
+        sub_luma_pels_ref[pos] = luma;
+      }
+    }
+  }
+};
+
TEST_P(CFLSubtractTest, SubtractTest) {
const int width = Width();
const int height = Height();
@@ -203,6 +245,57 @@
assertFaster(ref_elapsed_time, elapsed_time);
}
+// Conformance check: for NUM_ITERATIONS rounds, runs the predictor under test
+// and the C predictor on identically initialized buffers and requires every
+// pixel of the width x height output region to match.
+TEST_P(CFLPredictTest, PredictTest) {
+  const int width = Width();
+  const int height = Height();
+  const TX_SIZE tx_size = Tx_size();
+
+  for (int round = 0; round < NUM_ITERATIONS; ++round) {
+    init(width, height);
+
+    // Implementation under test first, then the C reference.
+    const cfl_predict_lbd_fn tested_fn = predict(tx_size);
+    tested_fn(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size, alpha_q3);
+    const cfl_predict_lbd_fn ref_fn = get_predict_lbd_fn_c(tx_size);
+    ref_fn(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, tx_size,
+           alpha_q3);
+
+    for (int row = 0; row < height; ++row) {
+      const int row_start = row * CFL_BUF_LINE;
+      for (int col = 0; col < width; ++col) {
+        const int pos = row_start + col;
+        ASSERT_EQ(chroma_pels_ref[pos], chroma_pels[pos]);
+      }
+    }
+  }
+}
+
+// Speed check: times NUM_ITERATIONS_SPEED calls of the C predictor, then the
+// same number of calls of the predictor under test, reports both through
+// printSpeed and expects the predictor under test to be faster. The DISABLED_
+// prefix keeps it out of default test runs.
+TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
+  const int width = Width();
+  const int height = Height();
+  const TX_SIZE tx_size = Tx_size();
+
+  aom_usec_timer c_timer;
+  aom_usec_timer simd_timer;
+
+  init(width, height);
+
+  // Reference timing; the function pointer is resolved outside the timed
+  // region.
+  const cfl_predict_lbd_fn c_fn = get_predict_lbd_fn_c(tx_size);
+  aom_usec_timer_start(&c_timer);
+  for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+    c_fn(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, tx_size, alpha_q3);
+  }
+  aom_usec_timer_mark(&c_timer);
+  const int ref_elapsed_time =
+      static_cast<int>(aom_usec_timer_elapsed(&c_timer));
+
+  // Timing of the implementation under test.
+  const cfl_predict_lbd_fn simd_fn = predict(tx_size);
+  aom_usec_timer_start(&simd_timer);
+  for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+    simd_fn(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size, alpha_q3);
+  }
+  aom_usec_timer_mark(&simd_timer);
+  const int elapsed_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+
+  printSpeed(ref_elapsed_time, elapsed_time, width, height);
+  assertFaster(ref_elapsed_time, elapsed_time);
+}
+
#if HAVE_SSE2
const subtract_param subtract_sizes_sse2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_sse2) };
@@ -216,9 +309,16 @@
const subsample_param subsample_sizes_ssse3[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_ssse3) };
+// Every transform size in ALL_CFL_TX_SIZES paired with the SSSE3 predictor
+// getter.
+const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
+ get_predict_lbd_fn_ssse3) };
+
INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleTest,
::testing::ValuesIn(subsample_sizes_ssse3));
+
+// Runs the CFLPredictTest cases once per entry of predict_sizes_ssse3.
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_ssse3));
#endif
+
#if HAVE_AVX2
const subtract_param subtract_sizes_avx2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_avx2) };
@@ -226,10 +326,16 @@
const subsample_param subsample_sizes_avx2[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_avx2) };
+// Every transform size in ALL_CFL_TX_SIZES paired with the AVX2 predictor
+// getter.
+const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+ get_predict_lbd_fn_avx2) };
+
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_avx2));
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleTest,
::testing::ValuesIn(subsample_sizes_avx2));
+
+// Runs the CFLPredictTest cases once per entry of predict_sizes_avx2.
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_avx2));
#endif
} // namespace