intrapred_neon: add aom_smooth_v_predictor_NxM_neon
ported from libgav1 @ v0.17.0-83-ge54abf5c
Bug: b/231616924
Change-Id: I813d48e386823b457085370cf71776609b405243
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index c82b622..0b19b92 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -183,19 +183,19 @@
specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/;
specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
@@ -260,12 +260,12 @@
specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/;
specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index bdc6a29..db383f2 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -2884,6 +2884,159 @@
#undef SMOOTH_NXM_WIDE
// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// For widths 4 and 8.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x8_t UNINITIALIZED_IS_SAFE(top_v); \
+ if ((W) == 4) { \
+ load_u8_4x1(top_row, &top_v, 0); \
+ } else { /* width == 8 */ \
+ top_v = vld1_u8(top_row); \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ for (int y = 0; y < height; ++y) { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ \
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
+ const uint16x8_t weighted_top_bl = \
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
+ const uint8x8_t pred = \
+ vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+ if ((W) == 4) { \
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+ } else { /* width == 8 */ \
+ vst1_u8(dst, pred); \
+ } \
+ dst += stride; \
+ } \
+ }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x16_t top_v[4]; \
+ top_v[0] = vld1q_u8(top_row); \
+ if ((W) > 16) { \
+ top_v[1] = vld1q_u8(top_row + 16); \
+ if ((W) == 64) { \
+ top_v[2] = vld1q_u8(top_row + 32); \
+ top_v[3] = vld1q_u8(top_row + 48); \
+ } \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ for (int y = 0; y < height; ++y) { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ const uint16x8_t weighted_bl = \
+ vmull_u8(scaled_weights_y, bottom_left_v); \
+ \
+ const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \
+ top_v[0], weights_y_v, weighted_bl); \
+ vst1q_u8(dst, pred_0); \
+ \
+ if ((W) > 16) { \
+ const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \
+ top_v[1], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 16, pred_1); \
+ \
+ if ((W) == 64) { \
+ const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
+ top_v[2], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 32, pred_2); \
+ \
+ const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
+ top_v[3], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 48, pred_3); \
+ } \
+ } \
+ \
+ dst += stride; \
+ } \
+ }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM_WIDE(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
// PAETH
static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index a219427..f96906d 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -333,25 +333,38 @@
#if HAVE_NEON
const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
- lowbd_entry(smooth, 4, 4, neon), lowbd_entry(smooth, 4, 8, neon),
- lowbd_entry(smooth, 4, 16, neon), lowbd_entry(smooth, 8, 4, neon),
- lowbd_entry(smooth, 8, 8, neon), lowbd_entry(smooth, 8, 16, neon),
- lowbd_entry(smooth, 8, 32, neon), lowbd_entry(smooth, 16, 4, neon),
- lowbd_entry(smooth, 16, 8, neon), lowbd_entry(smooth, 16, 16, neon),
- lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
- lowbd_entry(smooth, 32, 8, neon), lowbd_entry(smooth, 32, 16, neon),
- lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
- lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
- lowbd_entry(smooth, 64, 64, neon), lowbd_entry(paeth, 4, 4, neon),
- lowbd_entry(paeth, 4, 8, neon), lowbd_entry(paeth, 4, 16, neon),
- lowbd_entry(paeth, 8, 4, neon), lowbd_entry(paeth, 8, 8, neon),
- lowbd_entry(paeth, 8, 16, neon), lowbd_entry(paeth, 8, 32, neon),
- lowbd_entry(paeth, 16, 4, neon), lowbd_entry(paeth, 16, 8, neon),
- lowbd_entry(paeth, 16, 16, neon), lowbd_entry(paeth, 16, 32, neon),
- lowbd_entry(paeth, 16, 64, neon), lowbd_entry(paeth, 32, 8, neon),
- lowbd_entry(paeth, 32, 16, neon), lowbd_entry(paeth, 32, 32, neon),
- lowbd_entry(paeth, 32, 64, neon), lowbd_entry(paeth, 64, 16, neon),
- lowbd_entry(paeth, 64, 32, neon), lowbd_entry(paeth, 64, 64, neon),
+ lowbd_entry(smooth, 4, 4, neon), lowbd_entry(smooth, 4, 8, neon),
+ lowbd_entry(smooth, 4, 16, neon), lowbd_entry(smooth, 8, 4, neon),
+ lowbd_entry(smooth, 8, 8, neon), lowbd_entry(smooth, 8, 16, neon),
+ lowbd_entry(smooth, 8, 32, neon), lowbd_entry(smooth, 16, 4, neon),
+ lowbd_entry(smooth, 16, 8, neon), lowbd_entry(smooth, 16, 16, neon),
+ lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
+ lowbd_entry(smooth, 32, 8, neon), lowbd_entry(smooth, 32, 16, neon),
+ lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
+ lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
+ lowbd_entry(smooth, 64, 64, neon),
+
+ lowbd_entry(smooth_v, 4, 4, neon), lowbd_entry(smooth_v, 4, 8, neon),
+ lowbd_entry(smooth_v, 4, 16, neon), lowbd_entry(smooth_v, 8, 4, neon),
+ lowbd_entry(smooth_v, 8, 8, neon), lowbd_entry(smooth_v, 8, 16, neon),
+ lowbd_entry(smooth_v, 8, 32, neon), lowbd_entry(smooth_v, 16, 4, neon),
+ lowbd_entry(smooth_v, 16, 8, neon), lowbd_entry(smooth_v, 16, 16, neon),
+ lowbd_entry(smooth_v, 16, 32, neon), lowbd_entry(smooth_v, 16, 64, neon),
+ lowbd_entry(smooth_v, 32, 8, neon), lowbd_entry(smooth_v, 32, 16, neon),
+ lowbd_entry(smooth_v, 32, 32, neon), lowbd_entry(smooth_v, 32, 64, neon),
+ lowbd_entry(smooth_v, 64, 16, neon), lowbd_entry(smooth_v, 64, 32, neon),
+ lowbd_entry(smooth_v, 64, 64, neon),
+
+ lowbd_entry(paeth, 4, 4, neon), lowbd_entry(paeth, 4, 8, neon),
+ lowbd_entry(paeth, 4, 16, neon), lowbd_entry(paeth, 8, 4, neon),
+ lowbd_entry(paeth, 8, 8, neon), lowbd_entry(paeth, 8, 16, neon),
+ lowbd_entry(paeth, 8, 32, neon), lowbd_entry(paeth, 16, 4, neon),
+ lowbd_entry(paeth, 16, 8, neon), lowbd_entry(paeth, 16, 16, neon),
+ lowbd_entry(paeth, 16, 32, neon), lowbd_entry(paeth, 16, 64, neon),
+ lowbd_entry(paeth, 32, 8, neon), lowbd_entry(paeth, 32, 16, neon),
+ lowbd_entry(paeth, 32, 32, neon), lowbd_entry(paeth, 32, 64, neon),
+ lowbd_entry(paeth, 64, 16, neon), lowbd_entry(paeth, 64, 32, neon),
+ lowbd_entry(paeth, 64, 64, neon),
};
INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 50cc542..cbaca11 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -468,13 +468,14 @@
aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
- aom_smooth_predictor_4x4_neon, NULL, NULL)
+ aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
+ NULL)
INTRA_PRED_TEST(NEON, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_4x8_neon, NULL)
INTRA_PRED_TEST(NEON, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_4x16_neon, NULL)
#endif // HAVE_NEON
#if HAVE_MSA
@@ -559,16 +560,17 @@
aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
- aom_smooth_predictor_8x8_neon, NULL, NULL)
+ aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
+ NULL)
INTRA_PRED_TEST(NEON, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_8x4_neon, NULL)
INTRA_PRED_TEST(NEON, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_8x16_neon, NULL)
INTRA_PRED_TEST(NEON, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_8x32_neon, NULL)
#endif // HAVE_NEON
#if HAVE_MSA
@@ -686,19 +688,20 @@
aom_dc_top_predictor_16x16_neon,
aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
aom_h_predictor_16x16_neon, aom_paeth_predictor_16x16_neon,
- aom_smooth_predictor_16x16_neon, NULL, NULL)
+ aom_smooth_predictor_16x16_neon,
+ aom_smooth_v_predictor_16x16_neon, NULL)
INTRA_PRED_TEST(NEON, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_16x8_neon, NULL)
INTRA_PRED_TEST(NEON, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_16x32_neon, NULL)
INTRA_PRED_TEST(NEON, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_16x4_neon, NULL)
INTRA_PRED_TEST(NEON, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_16x64_neon, NULL)
#endif // HAVE_NEON
#if HAVE_MSA
@@ -805,16 +808,17 @@
aom_dc_top_predictor_32x32_neon,
aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
aom_h_predictor_32x32_neon, aom_paeth_predictor_32x32_neon,
- aom_smooth_predictor_32x32_neon, NULL, NULL)
+ aom_smooth_predictor_32x32_neon,
+ aom_smooth_v_predictor_32x32_neon, NULL)
INTRA_PRED_TEST(NEON, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_32x16_neon, NULL)
INTRA_PRED_TEST(NEON, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_32x64_neon, NULL)
INTRA_PRED_TEST(NEON, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_32x8_neon, NULL)
#endif // HAVE_NEON
#if HAVE_MSA
@@ -903,13 +907,13 @@
#if HAVE_NEON
INTRA_PRED_TEST(NEON, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_64x64_neon, NULL)
INTRA_PRED_TEST(NEON, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_64x32_neon, NULL)
INTRA_PRED_TEST(NEON, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
- NULL, NULL)
+ aom_smooth_v_predictor_64x16_neon, NULL)
#endif // HAVE_NEON
#if CONFIG_AV1_HIGHBITDEPTH