Add highbd_dc_128_predictor Neon implementation and tests
Add Neon implementations of highbd_dc_128_predictor for all block sizes.
Also add the corresponding tests and benchmarks.
We also take this opportunity to clean up the ordering of the specialize
calls in aom_dsp_rtcd_defs.pl and add the missing dc_128 cases for Neon.
Change-Id: I97de6e91a76c9cdda740563975424d514623eade
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e05c276..16bb4a9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -341,36 +341,47 @@
specialize qw/aom_highbd_h_predictor_64x32 neon/;
specialize qw/aom_highbd_h_predictor_64x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x64 neon/;
+
specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
+
+ specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 8363399..af212cd 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -82,6 +82,88 @@
#undef INTRA_PRED_SQUARE
// -----------------------------------------------------------------------------
+// DC_128
+
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x4_t dc) {
+  // Replicate dc down the block: one 4-lane store at the start of each of
+  // the h rows, with consecutive rows stride uint16_t elements apart.
+  for (int row = 0; row < h; ++row) vst1_u16(&dst[row * stride], dc);
+}
+
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x8_t dc) {
+  // Replicate dc down the block: one 8-lane store at the start of each of
+  // the h rows, with consecutive rows stride uint16_t elements apart.
+  for (int row = 0; row < h; ++row) vst1q_u16(&dst[row * stride], dc);
+}
+
+// 16-wide rows: two 8-lane stores of dc per row, for each of the h rows.
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) {
+    for (int x = 0; x < 16; x += 8) vst1q_u16(dst + row * stride + x, dc);
+  }
+}
+
+// Fill h rows of 32 uint16_t elements with dc. Rows start stride elements
+// apart and each one is covered by four back-to-back 8-lane stores.
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const out = dst + row * stride;
+    for (int x = 0; x < 32; x += 8) vst1q_u16(out + x, dc);
+  }
+}
+
+// Fill h rows of 64 uint16_t elements with dc. Rows start stride elements
+// apart. One store writes 8 lanes, so every row takes eight stores, issued
+// left to right at element offsets 0, 8, ..., 56.
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  enum { kRowWidth = 64, kLanes = 8 };
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const out = dst + row * stride;
+    for (int x = 0; x < kRowWidth; x += kLanes) {
+      vst1q_u16(out + x, dc);
+    }
+  }
+}
+
+#define HIGHBD_DC_PREDICTOR_128(w, h, q)                           \
+  void aom_highbd_dc_128_predictor_##w##x##h##_neon(               \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,      \
+      const uint16_t *left, int bd) {                              \
+    /* Neighbours are never read: the fill depends only on bd. */  \
+    (void)above;                                                   \
+    (void)left;                                                    \
+    highbd_dc_store_##w##xh(dst, stride, (h),                      \
+                            vdup##q##_n_u16(0x80 << (bd - 8)));    \
+  }
+
+HIGHBD_DC_PREDICTOR_128(4, 4, )  // Empty q: vdup_n_u16 for the 4xh helper.
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)  // q: vdupq_n_u16 for all wider helpers.
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+
+#undef HIGHBD_DC_PREDICTOR_128  // Generator macro is local to this section.
+
+// -----------------------------------------------------------------------------
// V_PRED
#define HIGHBD_V_NXM(W, H) \
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 8839450..4ae0798 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -405,9 +405,10 @@
highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
highbd_entry(dc, 64, 64, neon, 8),
- highbd_intrapred(v, neon, 12), highbd_intrapred(h, neon, 12),
- highbd_intrapred(paeth, neon, 12), highbd_intrapred(smooth, neon, 12),
- highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
+ highbd_intrapred(dc_128, neon, 12), highbd_intrapred(v, neon, 12),
+ highbd_intrapred(h, neon, 12), highbd_intrapred(paeth, neon, 12),
+ highbd_intrapred(smooth, neon, 12), highbd_intrapred(smooth_v, neon, 12),
+ highbd_intrapred(smooth_h, neon, 12),
};
INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 05462a2..8812bed 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1306,20 +1306,23 @@
#endif
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, nullptr,
- nullptr, nullptr, aom_highbd_v_predictor_4x4_neon,
+ nullptr, aom_highbd_dc_128_predictor_4x4_neon,
+ aom_highbd_v_predictor_4x4_neon,
aom_highbd_h_predictor_4x4_neon,
aom_highbd_paeth_predictor_4x4_neon,
aom_highbd_smooth_predictor_4x4_neon,
aom_highbd_smooth_v_predictor_4x4_neon,
aom_highbd_smooth_h_predictor_4x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_4x8_neon,
aom_highbd_v_predictor_4x8_neon,
aom_highbd_h_predictor_4x8_neon,
aom_highbd_paeth_predictor_4x8_neon,
aom_highbd_smooth_predictor_4x8_neon,
aom_highbd_smooth_v_predictor_4x8_neon,
aom_highbd_smooth_h_predictor_4x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_4x16_neon,
aom_highbd_v_predictor_4x16_neon,
aom_highbd_h_predictor_4x16_neon,
aom_highbd_paeth_predictor_4x16_neon,
@@ -1391,27 +1394,31 @@
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, nullptr,
- nullptr, nullptr, aom_highbd_v_predictor_8x8_neon,
+ nullptr, aom_highbd_dc_128_predictor_8x8_neon,
+ aom_highbd_v_predictor_8x8_neon,
aom_highbd_h_predictor_8x8_neon,
aom_highbd_paeth_predictor_8x8_neon,
aom_highbd_smooth_predictor_8x8_neon,
aom_highbd_smooth_v_predictor_8x8_neon,
aom_highbd_smooth_h_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_8x4_neon,
aom_highbd_v_predictor_8x4_neon,
aom_highbd_h_predictor_8x4_neon,
aom_highbd_paeth_predictor_8x4_neon,
aom_highbd_smooth_predictor_8x4_neon,
aom_highbd_smooth_v_predictor_8x4_neon,
aom_highbd_smooth_h_predictor_8x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_8x16_neon,
aom_highbd_v_predictor_8x16_neon,
aom_highbd_h_predictor_8x16_neon,
aom_highbd_paeth_predictor_8x16_neon,
aom_highbd_smooth_predictor_8x16_neon,
aom_highbd_smooth_v_predictor_8x16_neon,
aom_highbd_smooth_h_predictor_8x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_8x32_neon,
aom_highbd_v_predictor_8x32_neon,
aom_highbd_h_predictor_8x32_neon,
aom_highbd_paeth_predictor_8x32_neon,
@@ -1501,35 +1508,39 @@
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
- nullptr, nullptr, nullptr,
+ nullptr, nullptr, aom_highbd_dc_128_predictor_16x16_neon,
aom_highbd_v_predictor_16x16_neon,
aom_highbd_h_predictor_16x16_neon,
aom_highbd_paeth_predictor_16x16_neon,
aom_highbd_smooth_predictor_16x16_neon,
aom_highbd_smooth_v_predictor_16x16_neon,
aom_highbd_smooth_h_predictor_16x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_16x8_neon,
aom_highbd_v_predictor_16x8_neon,
aom_highbd_h_predictor_16x8_neon,
aom_highbd_paeth_predictor_16x8_neon,
aom_highbd_smooth_predictor_16x8_neon,
aom_highbd_smooth_v_predictor_16x8_neon,
aom_highbd_smooth_h_predictor_16x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_16x32_neon,
aom_highbd_v_predictor_16x32_neon,
aom_highbd_h_predictor_16x32_neon,
aom_highbd_paeth_predictor_16x32_neon,
aom_highbd_smooth_predictor_16x32_neon,
aom_highbd_smooth_v_predictor_16x32_neon,
aom_highbd_smooth_h_predictor_16x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_16x4_neon,
aom_highbd_v_predictor_16x4_neon,
aom_highbd_h_predictor_16x4_neon,
aom_highbd_paeth_predictor_16x4_neon,
aom_highbd_smooth_predictor_16x4_neon,
aom_highbd_smooth_v_predictor_16x4_neon,
aom_highbd_smooth_h_predictor_16x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_16x64_neon,
aom_highbd_v_predictor_16x64_neon,
aom_highbd_h_predictor_16x64_neon,
aom_highbd_paeth_predictor_16x64_neon,
@@ -1602,28 +1613,31 @@
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
- nullptr, nullptr, nullptr,
+ nullptr, nullptr, aom_highbd_dc_128_predictor_32x32_neon,
aom_highbd_v_predictor_32x32_neon,
aom_highbd_h_predictor_32x32_neon,
aom_highbd_paeth_predictor_32x32_neon,
aom_highbd_smooth_predictor_32x32_neon,
aom_highbd_smooth_v_predictor_32x32_neon,
aom_highbd_smooth_h_predictor_32x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_32x16_neon,
aom_highbd_v_predictor_32x16_neon,
aom_highbd_h_predictor_32x16_neon,
aom_highbd_paeth_predictor_32x16_neon,
aom_highbd_smooth_predictor_32x16_neon,
aom_highbd_smooth_v_predictor_32x16_neon,
aom_highbd_smooth_h_predictor_32x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_32x64_neon,
aom_highbd_v_predictor_32x64_neon,
aom_highbd_h_predictor_32x64_neon,
aom_highbd_paeth_predictor_32x64_neon,
aom_highbd_smooth_predictor_32x64_neon,
aom_highbd_smooth_v_predictor_32x64_neon,
aom_highbd_smooth_h_predictor_32x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_32x8_neon,
aom_highbd_v_predictor_32x8_neon,
aom_highbd_h_predictor_32x8_neon,
aom_highbd_paeth_predictor_32x8_neon,
@@ -1659,21 +1673,23 @@
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
- nullptr, nullptr, nullptr,
+ nullptr, nullptr, aom_highbd_dc_128_predictor_64x64_neon,
aom_highbd_v_predictor_64x64_neon,
aom_highbd_h_predictor_64x64_neon,
aom_highbd_paeth_predictor_64x64_neon,
aom_highbd_smooth_predictor_64x64_neon,
aom_highbd_smooth_v_predictor_64x64_neon,
aom_highbd_smooth_h_predictor_64x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_64x32_neon,
aom_highbd_v_predictor_64x32_neon,
aom_highbd_h_predictor_64x32_neon,
aom_highbd_paeth_predictor_64x32_neon,
aom_highbd_smooth_predictor_64x32_neon,
aom_highbd_smooth_v_predictor_64x32_neon,
aom_highbd_smooth_h_predictor_64x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr,
+ aom_highbd_dc_128_predictor_64x16_neon,
aom_highbd_v_predictor_64x16_neon,
aom_highbd_h_predictor_64x16_neon,
aom_highbd_paeth_predictor_64x16_neon,