Add highbd_dc_128_predictor Neon implementation and tests

Add Neon implementations of highbd_dc_128_predictor for all block
sizes. Also add the corresponding tests and benchmarks.

We also take this opportunity to clean up the ordering of the
specialize calls in aom_dsp_rtcd_defs.pl and add the missing dc_128
cases for Neon.

Change-Id: I97de6e91a76c9cdda740563975424d514623eade
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index e05c276..16bb4a9 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -341,36 +341,47 @@ specialize qw/aom_highbd_h_predictor_64x32 neon/; specialize qw/aom_highbd_h_predictor_64x64 neon/; + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_4x16 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x32 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x4 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x64 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x8 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x64 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x16 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x32 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x64 neon/; + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; - specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; - specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; - specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; - specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; - specialize 
qw/aom_highbd_dc_top_predictor_8x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/; specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; - specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; + + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; - specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; specialize qw/aom_highbd_paeth_predictor_4x4 neon/; specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c index 8363399..af212cd 100644 --- a/aom_dsp/arm/highbd_intrapred_neon.c +++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -82,6 +82,97 @@
 #undef INTRA_PRED_SQUARE
 
 // -----------------------------------------------------------------------------
+// DC_128
+
+// The highbd_dc_store_<W>xh helpers write the vector dc to each of h rows of
+// dst. Consecutive rows are stride uint16_t elements apart and every row
+// receives W samples: one 4-lane store for W == 4, or W / 8 8-lane stores.
+
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x4_t dc) {
+  for (int i = 0; i < h; ++i) {
+    vst1_u16(dst + i * stride, dc);
+  }
+}
+
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x8_t dc) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u16(dst + i * stride, dc);
+  }
+}
+
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u16(dst + i * stride, dc);
+    vst1q_u16(dst + i * stride + 8, dc);
+  }
+}
+
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u16(dst + i * stride, dc);
+    vst1q_u16(dst + i * stride + 8, dc);
+    vst1q_u16(dst + i * stride + 16, dc);
+    vst1q_u16(dst + i * stride + 24, dc);
+  }
+}
+
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u16(dst + i * stride, dc);
+    vst1q_u16(dst + i * stride + 8, dc);
+    vst1q_u16(dst + i * stride + 16, dc);
+    vst1q_u16(dst + i * stride + 24, dc);
+    vst1q_u16(dst + i * stride + 32, dc);
+    vst1q_u16(dst + i * stride + 40, dc);
+    vst1q_u16(dst + i * stride + 48, dc);
+    vst1q_u16(dst + i * stride + 56, dc);
+  }
+}
+
+// Defines aom_highbd_dc_128_predictor_<w>x<h>_neon(), which fills the w x h
+// block at dst with the constant 0x80 << (bd - 8), i.e. 1 << (bd - 1). The
+// above and left arguments are never read. q is the intrinsic suffix: empty
+// for the 64-bit vdup_n_u16 (w == 4), q for the 128-bit vdupq_n_u16 (w >= 8),
+// matching the vector type taken by highbd_dc_store_<w>xh. bd must be at
+// least 8, otherwise the shift count would be negative.
+#define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
+  void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)above;                                                \
+    (void)left;                                                 \
+    highbd_dc_store_##w##xh(dst, stride, (h),                   \
+                            vdup##q##_n_u16(0x80 << (bd - 8))); \
+  }
+
+HIGHBD_DC_PREDICTOR_128(4, 4, )
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+
+#undef HIGHBD_DC_PREDICTOR_128
+
+// -----------------------------------------------------------------------------
 // V_PRED
 
 #define HIGHBD_V_NXM(W, H) \
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index 8839450..4ae0798 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc
@@ -405,9 +405,10 @@ highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8), highbd_entry(dc, 64, 64, neon, 8), - highbd_intrapred(v, neon, 12), highbd_intrapred(h, neon, 12), - highbd_intrapred(paeth, neon, 12), highbd_intrapred(smooth, neon, 12), - highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12), + highbd_intrapred(dc_128, neon, 12), highbd_intrapred(v, neon, 12), + highbd_intrapred(h, neon, 12), highbd_intrapred(paeth, neon, 12), + highbd_intrapred(smooth, neon, 12), highbd_intrapred(smooth_v, neon, 12), + highbd_intrapred(smooth_h, neon, 12), }; INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 05462a2..8812bed 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc
@@ -1306,20 +1306,23 @@ #endif #if HAVE_NEON HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, nullptr, - nullptr, nullptr, aom_highbd_v_predictor_4x4_neon, + nullptr, aom_highbd_dc_128_predictor_4x4_neon, + aom_highbd_v_predictor_4x4_neon, aom_highbd_h_predictor_4x4_neon, aom_highbd_paeth_predictor_4x4_neon, aom_highbd_smooth_predictor_4x4_neon, aom_highbd_smooth_v_predictor_4x4_neon, aom_highbd_smooth_h_predictor_4x4_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_4x8_neon, aom_highbd_v_predictor_4x8_neon, aom_highbd_h_predictor_4x8_neon, aom_highbd_paeth_predictor_4x8_neon, aom_highbd_smooth_predictor_4x8_neon, aom_highbd_smooth_v_predictor_4x8_neon, aom_highbd_smooth_h_predictor_4x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_4x16_neon, aom_highbd_v_predictor_4x16_neon, aom_highbd_h_predictor_4x16_neon, aom_highbd_paeth_predictor_4x16_neon, @@ -1391,27 +1394,31 @@ #if HAVE_NEON HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, nullptr, - nullptr, nullptr, aom_highbd_v_predictor_8x8_neon, + nullptr, aom_highbd_dc_128_predictor_8x8_neon, + aom_highbd_v_predictor_8x8_neon, aom_highbd_h_predictor_8x8_neon, aom_highbd_paeth_predictor_8x8_neon, aom_highbd_smooth_predictor_8x8_neon, aom_highbd_smooth_v_predictor_8x8_neon, aom_highbd_smooth_h_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_8x4_neon, aom_highbd_v_predictor_8x4_neon, aom_highbd_h_predictor_8x4_neon, aom_highbd_paeth_predictor_8x4_neon, aom_highbd_smooth_predictor_8x4_neon, aom_highbd_smooth_v_predictor_8x4_neon, aom_highbd_smooth_h_predictor_8x4_neon) 
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_8x16_neon, aom_highbd_v_predictor_8x16_neon, aom_highbd_h_predictor_8x16_neon, aom_highbd_paeth_predictor_8x16_neon, aom_highbd_smooth_predictor_8x16_neon, aom_highbd_smooth_v_predictor_8x16_neon, aom_highbd_smooth_h_predictor_8x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_8x32_neon, aom_highbd_v_predictor_8x32_neon, aom_highbd_h_predictor_8x32_neon, aom_highbd_paeth_predictor_8x32_neon, @@ -1501,35 +1508,39 @@ #if HAVE_NEON HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon, - nullptr, nullptr, nullptr, + nullptr, nullptr, aom_highbd_dc_128_predictor_16x16_neon, aom_highbd_v_predictor_16x16_neon, aom_highbd_h_predictor_16x16_neon, aom_highbd_paeth_predictor_16x16_neon, aom_highbd_smooth_predictor_16x16_neon, aom_highbd_smooth_v_predictor_16x16_neon, aom_highbd_smooth_h_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_16x8_neon, aom_highbd_v_predictor_16x8_neon, aom_highbd_h_predictor_16x8_neon, aom_highbd_paeth_predictor_16x8_neon, aom_highbd_smooth_predictor_16x8_neon, aom_highbd_smooth_v_predictor_16x8_neon, aom_highbd_smooth_h_predictor_16x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_16x32_neon, aom_highbd_v_predictor_16x32_neon, aom_highbd_h_predictor_16x32_neon, aom_highbd_paeth_predictor_16x32_neon, aom_highbd_smooth_predictor_16x32_neon, aom_highbd_smooth_v_predictor_16x32_neon, aom_highbd_smooth_h_predictor_16x32_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, 
nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_16x4_neon, aom_highbd_v_predictor_16x4_neon, aom_highbd_h_predictor_16x4_neon, aom_highbd_paeth_predictor_16x4_neon, aom_highbd_smooth_predictor_16x4_neon, aom_highbd_smooth_v_predictor_16x4_neon, aom_highbd_smooth_h_predictor_16x4_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_16x64_neon, aom_highbd_v_predictor_16x64_neon, aom_highbd_h_predictor_16x64_neon, aom_highbd_paeth_predictor_16x64_neon, @@ -1602,28 +1613,31 @@ #if HAVE_NEON HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon, - nullptr, nullptr, nullptr, + nullptr, nullptr, aom_highbd_dc_128_predictor_32x32_neon, aom_highbd_v_predictor_32x32_neon, aom_highbd_h_predictor_32x32_neon, aom_highbd_paeth_predictor_32x32_neon, aom_highbd_smooth_predictor_32x32_neon, aom_highbd_smooth_v_predictor_32x32_neon, aom_highbd_smooth_h_predictor_32x32_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_32x16_neon, aom_highbd_v_predictor_32x16_neon, aom_highbd_h_predictor_32x16_neon, aom_highbd_paeth_predictor_32x16_neon, aom_highbd_smooth_predictor_32x16_neon, aom_highbd_smooth_v_predictor_32x16_neon, aom_highbd_smooth_h_predictor_32x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_32x64_neon, aom_highbd_v_predictor_32x64_neon, aom_highbd_h_predictor_32x64_neon, aom_highbd_paeth_predictor_32x64_neon, aom_highbd_smooth_predictor_32x64_neon, aom_highbd_smooth_v_predictor_32x64_neon, aom_highbd_smooth_h_predictor_32x64_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, 
nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_32x8_neon, aom_highbd_v_predictor_32x8_neon, aom_highbd_h_predictor_32x8_neon, aom_highbd_paeth_predictor_32x8_neon, @@ -1659,21 +1673,23 @@ #if HAVE_NEON HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon, - nullptr, nullptr, nullptr, + nullptr, nullptr, aom_highbd_dc_128_predictor_64x64_neon, aom_highbd_v_predictor_64x64_neon, aom_highbd_h_predictor_64x64_neon, aom_highbd_paeth_predictor_64x64_neon, aom_highbd_smooth_predictor_64x64_neon, aom_highbd_smooth_v_predictor_64x64_neon, aom_highbd_smooth_h_predictor_64x64_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_64x32_neon, aom_highbd_v_predictor_64x32_neon, aom_highbd_h_predictor_64x32_neon, aom_highbd_paeth_predictor_64x32_neon, aom_highbd_smooth_predictor_64x32_neon, aom_highbd_smooth_v_predictor_64x32_neon, aom_highbd_smooth_h_predictor_64x32_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr, +HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, + aom_highbd_dc_128_predictor_64x16_neon, aom_highbd_v_predictor_64x16_neon, aom_highbd_h_predictor_64x16_neon, aom_highbd_paeth_predictor_64x16_neon,