[CFL] SSSE3 Versions of luma_subsampling_420_hbd
We include unit tests for conformance and speed.
SSSE3/CFLSubsampleHBDSpeedTest results (i7-7820X):
4x4: C time = 96 us, SIMD time = 56 us (~1.7x)
8x8: C time = 319 us, SIMD time = 96 us (~3.3x)
16x16: C time = 1637 us, SIMD time = 243 us (~6.7x)
32x32: C time = 8057 us, SIMD time = 890 us (~9.1x)
Change-Id: Ie49c133e45c3a5d216519d46e27eb7ce4553faa7
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 564d62f..63b4c6c 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -355,6 +355,9 @@
add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_hbd ssse3/;
+
add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;