[CFL] Neon Version of 4:4:4 HBD Subsampling
Includes unit tests for conformance and speed.
NEON/CFLSubsampleHBD444SpeedTest
4x4: C time = 1515 us, SIMD time = 513 us (~3x)
8x8: C time = 5569 us, SIMD time = 1302 us (~4.3x)
16x16: C time = 22337 us, SIMD time = 4203 us (~5.3x)
32x32: C time = 87936 us, SIMD time = 17046 us (~5.2x)
Change-Id: I708147da051ebcd28c51cceaba1017a658911c88
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index a759b9d..e381448 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -230,6 +230,37 @@
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+ int input_stride,
+ int16_t *pred_buf_q3, int width,
+ int height) {
+ const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const int16x4_t top = vreinterpret_s16_u16(vld1_u16(input));
+ vst1_s16(pred_buf_q3, vshl_n_s16(top, 3));
+ } else if (width == 8) {
+ const int16x8_t top = vreinterpretq_s16_u16(vld1q_u16(input));
+ vst1q_s16(pred_buf_q3, vshlq_n_s16(top, 3));
+ } else if (width == 16) {
+ const uint16x8x2_t top = vld2q_u16(input);
+ int16x8x2_t results;
+ results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
+ results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
+ vst2q_s16(pred_buf_q3, results);
+ } else {
+ const uint16x8x4_t top = vld4q_u16(input);
+ int16x8x4_t results;
+ results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
+ results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
+ results.val[2] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[2]), 3);
+ results.val[3] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[3]), 3);
+ vst4q_s16(pred_buf_q3, results);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
CFL_GET_SUBSAMPLE_FUNCTION(neon)
static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 6a7440d..e9c605b 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -362,7 +362,7 @@
specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2/;
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index a47be66..3b15e86 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -335,10 +335,6 @@
}
}
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when HBD 444 SIMD
-// will be implemented
-CFL_SUBSAMPLE_FUNCTIONS(c, 444, hbd)
-
CFL_GET_SUBSAMPLE_FUNCTION(c)
static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
@@ -349,8 +345,7 @@
}
return cfl_get_luma_subsampling_422_hbd(tx_size);
}
- // TODO(ltrudeau) Remove _c when HBD 444 SIMD is added
- return cfl_get_luma_subsampling_444_hbd_c(tx_size);
+ return cfl_get_luma_subsampling_444_hbd(tx_size);
}
static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 766515b..969cd31 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -156,7 +156,8 @@
CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
- CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd)
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
// Null function used for invalid tx_sizes
static INLINE void cfl_subtract_average_null(int16_t *pred_buf_q3) {
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index aa68f78..1e869cf 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -297,7 +297,6 @@
} while (pred_buf_q3 < end);
}
-CFL_SUBSAMPLE_FUNCTIONS(ssse3, 444, hbd)
CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 91af71f..020c124 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -529,11 +529,9 @@
};
const subsample_hbd_param subsample_hbd_sizes_neon[] = {
- ALL_CFL_TX_SIZES_SUBSAMPLE(
- cfl_get_luma_subsampling_420_hbd_neon,
- cfl_get_luma_subsampling_422_hbd_neon,
- cfl_get_luma_subsampling_444_hbd_c) // TODO(ltrudeau) replace with
- // 444 when SIMD is available
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
+ cfl_get_luma_subsampling_422_hbd_neon,
+ cfl_get_luma_subsampling_444_hbd_neon)
};
const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(