[CFL] NEON Version of luma_subsampling_420_lbd Includes unit tests for conformance and speed. NEON/CFLSubsampleTest (Odroid C2, Cortex-A53) 4x4: C time = 1003 us, SIMD time = 811 us (~1.2x) 8x8: C time = 3042 us, SIMD time = 1174 us (~2.6x) 16x16: C time = 11792 us, SIMD time = 2389 us (~4.9x) 32x32: C time = 45370 us, SIMD time = 9224 us (~4.9x) Change-Id: I717714e942680ca51a0544f32e2548386290ce32
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c index cf97e8d..beaaa13 100644 --- a/av1/common/arm/cfl_neon.c +++ b/av1/common/arm/cfl_neon.c
@@ -22,6 +22,49 @@ return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); } +// Load half of a vector and duplicated in other half +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store half of a vector. +static INLINE void vsth_s16(int16_t *ptr, int16x4_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_s16(val)[0]; +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + int16_t *pred_buf_q3, int width, + int height) { + const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1)); + } else { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(sum), 1)); + if (width == 32) { + const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16)); + const uint16x8_t next_sum = + vpadalq_u8(next_top, vld1q_u8(input + 16 + input_stride)); + vst1q_s16(pred_buf_q3 + 8, + vshlq_n_s16(vreinterpretq_s16_u16(next_sum), 1)); + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + static INLINE void subtract_average_neon(int16_t *pred_buf, int width, int height, int round_offset, const int num_pel_log2) {
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 8017b40..834a1c2 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -564,7 +564,7 @@ specialize qw/get_subtract_average_fn sse2 avx2 neon/; add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2/; +specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/; add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size"; specialize qw/get_predict_lbd_fn ssse3 avx2/;
diff --git a/test/cfl_test.cc b/test/cfl_test.cc index 98f4ec2..658d50a 100644 --- a/test/cfl_test.cc +++ b/test/cfl_test.cc
@@ -480,8 +480,14 @@ const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES( get_subtract_average_fn_neon) }; +const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_luma_subsampling_420_lbd_neon) }; + INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest, ::testing::ValuesIn(sub_avg_sizes_neon)); +INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleTest, + ::testing::ValuesIn(subsample_sizes_neon)); + #endif } // namespace