Add Neon implementations for rectangular dc predictors
We already have Neon implementations of the dc predictors for all square
block sizes, so add the remaining rectangular block sizes and update the
unit tests and speed tests to match.
On Neoverse V1, this gives a small improvement over the C code when
built with Clang 15 (~3.5%) and a significant improvement when built
with GCC 12 (~30%).
Change-Id: Iab9fed1610c40dbb1698abed4156bf01806919ed
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index cbcc24d..dab303c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -86,63 +86,63 @@
}
specialize qw/aom_dc_top_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x4 neon sse2/;
specialize qw/aom_dc_top_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
-specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x8 neon sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
-specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/;
specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/;
specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/;
specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x4 neon sse2/;
specialize qw/aom_dc_left_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
-specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x8 neon sse2/;
specialize qw/aom_dc_left_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
-specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/;
specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/;
specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/;
specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x4 neon sse2/;
specialize qw/aom_dc_128_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
-specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x8 neon sse2/;
specialize qw/aom_dc_128_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
-specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x16 neon sse2 avx2/;
specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/;
specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
specialize qw/aom_v_predictor_4x4 neon sse2/;
@@ -268,24 +268,24 @@
# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
# by multiply and shift.
specialize qw/aom_dc_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_dc_predictor_4x16 sse2/;
-specialize qw/aom_dc_predictor_8x4 sse2/;
+specialize qw/aom_dc_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x4 neon sse2/;
specialize qw/aom_dc_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_dc_predictor_8x32 sse2/;
-specialize qw/aom_dc_predictor_16x4 sse2/;
-specialize qw/aom_dc_predictor_16x8 sse2/;
+specialize qw/aom_dc_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_predictor_16x8 neon sse2/;
specialize qw/aom_dc_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_dc_predictor_16x64 sse2/;
-specialize qw/aom_dc_predictor_32x8 sse2/;
-specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/;
specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/;
specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/;
-specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/;
specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 5f43802..c724fa5 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,15 +24,15 @@
// DC 4x4
static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
- const uint8x8_t a = vld1_u8(in);
+ const uint8x8_t a = load_u8_4x1_lane0(in);
const uint16x4_t p0 = vpaddl_u8(a);
const uint16x4_t p1 = vpadd_u16(p0, p0);
return vcombine_u16(p1, vdup_n_u16(0));
}
-static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, uint8x8_t dc0) {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- for (int i = 0; i < 4; ++i) {
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
store_u8_4x1(dst + i * stride, dc, 0);
}
}
@@ -43,7 +43,7 @@
const uint16x8_t sum_left = dc_load_sum_4(left);
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
- dc_store_4x4(dst, stride, dc0);
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -51,7 +51,7 @@
const uint16x8_t sum_left = dc_load_sum_4(left);
const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
(void)above;
- dc_store_4x4(dst, stride, dc0);
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -59,7 +59,7 @@
const uint16x8_t sum_top = dc_load_sum_4(above);
const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
(void)left;
- dc_store_4x4(dst, stride, dc0);
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -67,7 +67,7 @@
const uint8x8_t dc0 = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_store_4x4(dst, stride, dc0);
+ dc_store_4xh(dst, stride, 4, dc0);
}
//------------------------------------------------------------------------------
@@ -100,9 +100,9 @@
#endif
}
-static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t dc0) {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- for (int i = 0; i < 8; ++i) {
+static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
vst1_u8(dst + i * stride, dc);
}
}
@@ -114,7 +114,7 @@
uint16x8_t sum = vaddl_u8(sum_left, sum_top);
sum = horizontal_add_and_broadcast_u16x8(sum);
const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
- dc_store_8x8(dst, stride, dc0);
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -122,7 +122,7 @@
const uint16x8_t sum_left = dc_load_sum_8(left);
const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
(void)above;
- dc_store_8x8(dst, stride, dc0);
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -130,7 +130,7 @@
const uint16x8_t sum_top = dc_load_sum_8(above);
const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
(void)left;
- dc_store_8x8(dst, stride, dc0);
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -138,13 +138,13 @@
const uint8x8_t dc0 = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_store_8x8(dst, stride, dc0);
+ dc_store_8xh(dst, stride, 8, dc0);
}
//------------------------------------------------------------------------------
// DC 16x16
-static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
const uint8x16_t a = vld1q_u8(in);
// delay the remainder of the reduction until
// horizontal_add_and_broadcast_u16x8, since we want to do it once rather
@@ -152,57 +152,58 @@
return vpaddlq_u8(a);
}
-static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
- uint8x8_t dc0) {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- for (int i = 0; i < 16; ++i) {
+static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
+}
+
+static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
vst1q_u8(dst + i * stride, dc);
}
}
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint16x8_t sum_top = dc_load_sum_16(above);
- const uint16x8_t sum_left = dc_load_sum_16(left);
+ const uint16x8_t sum_top = dc_load_partial_sum_16(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_16(left);
uint16x8_t sum = vaddq_u16(sum_left, sum_top);
sum = horizontal_add_and_broadcast_u16x8(sum);
const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
- dc_store_16x16(dst, stride, dc0);
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_left = dc_load_sum_16(left);
- sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+ const uint16x8_t sum_left = dc_load_sum_16(left);
const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
(void)above;
- dc_store_16x16(dst, stride, dc0);
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_top = dc_load_sum_16(above);
- sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+ const uint16x8_t sum_top = dc_load_sum_16(above);
const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
(void)left;
- dc_store_16x16(dst, stride, dc0);
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- const uint8x8_t dc0 = vdup_n_u8(0x80);
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
(void)above;
(void)left;
- dc_store_16x16(dst, stride, dc0);
+ dc_store_16xh(dst, stride, 16, dc0);
}
//------------------------------------------------------------------------------
// DC 32x32
-static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
const uint8x16_t a0 = vld1q_u8(in);
const uint8x16_t a1 = vld1q_u8(in + 16);
// delay the remainder of the reduction until
@@ -211,10 +212,13 @@
return vpadalq_u8(vpaddlq_u8(a0), a1);
}
-static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
- uint8x8_t dc0) {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- for (int i = 0; i < 32; ++i) {
+static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
+}
+
+static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
vst1q_u8(dst + i * stride, dc);
vst1q_u8(dst + i * stride + 16, dc);
}
@@ -222,47 +226,45 @@
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint16x8_t sum_top = dc_load_sum_32(above);
- const uint16x8_t sum_left = dc_load_sum_32(left);
+ const uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_32(left);
uint16x8_t sum = vaddq_u16(sum_left, sum_top);
sum = horizontal_add_and_broadcast_u16x8(sum);
const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
- dc_store_32x32(dst, stride, dc0);
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_left = dc_load_sum_32(left);
- sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+ const uint16x8_t sum_left = dc_load_sum_32(left);
const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
(void)above;
- dc_store_32x32(dst, stride, dc0);
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_top = dc_load_sum_32(above);
- sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+ const uint16x8_t sum_top = dc_load_sum_32(above);
const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
(void)left;
- dc_store_32x32(dst, stride, dc0);
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- const uint8x8_t dc0 = vdup_n_u8(0x80);
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
(void)above;
(void)left;
- dc_store_32x32(dst, stride, dc0);
+ dc_store_32xh(dst, stride, 32, dc0);
}
//------------------------------------------------------------------------------
// DC 64x64
-static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
const uint8x16_t a0 = vld1q_u8(in);
const uint8x16_t a1 = vld1q_u8(in + 16);
const uint8x16_t a2 = vld1q_u8(in + 32);
@@ -275,10 +277,13 @@
return vaddq_u16(p01, p23);
}
-static INLINE void dc_store_64x64(uint8_t *dst, ptrdiff_t stride,
- uint8x8_t dc0) {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- for (int i = 0; i < 64; ++i) {
+static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
+}
+
+static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
vst1q_u8(dst + i * stride, dc);
vst1q_u8(dst + i * stride + 16, dc);
vst1q_u8(dst + i * stride + 32, dc);
@@ -288,43 +293,286 @@
void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint16x8_t sum_top = dc_load_sum_64(above);
- const uint16x8_t sum_left = dc_load_sum_64(left);
+ const uint16x8_t sum_top = dc_load_partial_sum_64(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_64(left);
uint16x8_t sum = vaddq_u16(sum_left, sum_top);
sum = horizontal_add_and_broadcast_u16x8(sum);
const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
- dc_store_64x64(dst, stride, dc0);
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}
void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_left = dc_load_sum_64(left);
- sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+ const uint16x8_t sum_left = dc_load_sum_64(left);
const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
(void)above;
- dc_store_64x64(dst, stride, dc0);
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}
void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- uint16x8_t sum_top = dc_load_sum_64(above);
- sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+ const uint16x8_t sum_top = dc_load_sum_64(above);
const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
(void)left;
- dc_store_64x64(dst, stride, dc0);
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}
void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
- const uint8x8_t dc0 = vdup_n_u8(0x80);
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
(void)above;
(void)left;
- dc_store_64x64(dst, stride, dc0);
+ dc_store_64xh(dst, stride, 64, dc0);
}
+//------------------------------------------------------------------------------
+// DC rectangular cases
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+ int shift1, int multiplier) {
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+ assert(expected_dc < (1 << 8));
+ return expected_dc;
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x8_t l = vld1_u8(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x8_t l = load_u8_4x1_lane0(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1_lane0(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = load_u8_4x1_lane0(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddw_u8(sum_left, a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(sum_top, l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+#define DC_PREDICTOR_128(w, h, q) \
+ void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ (void)left; \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \
+ }
+
+DC_PREDICTOR_128(4, 8, )
+DC_PREDICTOR_128(4, 16, )
+DC_PREDICTOR_128(8, 4, )
+DC_PREDICTOR_128(8, 16, )
+DC_PREDICTOR_128(8, 32, )
+DC_PREDICTOR_128(16, 4, q)
+DC_PREDICTOR_128(16, 8, q)
+DC_PREDICTOR_128(16, 32, q)
+DC_PREDICTOR_128(16, 64, q)
+DC_PREDICTOR_128(32, 8, q)
+DC_PREDICTOR_128(32, 16, q)
+DC_PREDICTOR_128(32, 64, q)
+DC_PREDICTOR_128(64, 32, q)
+DC_PREDICTOR_128(64, 16, q)
+
+#undef DC_PREDICTOR_128
+
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ const uint16x8_t sum = dc_load_sum_##h(left); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(8, 4, 2, )
+DC_PREDICTOR_LEFT(8, 16, 4, )
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 32, 5, )
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+
+#undef DC_PREDICTOR_LEFT
+
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)left; \
+ const uint16x8_t sum = dc_load_sum_##w(above); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, )
+DC_PREDICTOR_TOP(8, 16, 3, )
+DC_PREDICTOR_TOP(8, 32, 3, )
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
// -----------------------------------------------------------------------------
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 58f5226..30328f7 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -94,6 +94,13 @@
vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
} while (0)
+// Load four bytes into the low half of a uint8x8_t, zero the upper half.
+static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+ uint8x8_t ret = vdup_n_u8(0);
+ load_u8_4x1(p, &ret, 0);
+ return ret;
+}
+
// Load 2 sets of 4 bytes when alignment is guaranteed.
static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
uint32x2_t a = vdup_n_u32(0);
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 4804476..01a5dc2 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -340,30 +340,17 @@
#if HAVE_NEON
const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
- lowbd_entry(dc, 4, 4, neon), lowbd_entry(dc, 8, 8, neon),
- lowbd_entry(dc, 16, 16, neon), lowbd_entry(dc, 32, 32, neon),
- lowbd_entry(dc, 64, 64, neon),
+ lowbd_intrapred(dc, neon), lowbd_intrapred(dc_top, neon),
+ lowbd_intrapred(dc_left, neon), lowbd_intrapred(dc_128, neon),
- lowbd_entry(dc_top, 4, 4, neon), lowbd_entry(dc_top, 8, 8, neon),
- lowbd_entry(dc_top, 16, 16, neon), lowbd_entry(dc_top, 32, 32, neon),
- lowbd_entry(dc_top, 64, 64, neon),
+ lowbd_entry(v, 4, 4, neon), lowbd_entry(v, 8, 8, neon),
+ lowbd_entry(v, 16, 16, neon), lowbd_entry(v, 32, 32, neon),
- lowbd_entry(dc_left, 4, 4, neon), lowbd_entry(dc_left, 8, 8, neon),
- lowbd_entry(dc_left, 16, 16, neon), lowbd_entry(dc_left, 32, 32, neon),
- lowbd_entry(dc_left, 64, 64, neon),
+ lowbd_entry(h, 4, 4, neon), lowbd_entry(h, 8, 8, neon),
+ lowbd_entry(h, 16, 16, neon), lowbd_entry(h, 32, 32, neon),
- lowbd_entry(dc_128, 4, 4, neon), lowbd_entry(dc_128, 8, 8, neon),
- lowbd_entry(dc_128, 16, 16, neon), lowbd_entry(dc_128, 32, 32, neon),
- lowbd_entry(dc_128, 64, 64, neon),
-
- lowbd_entry(v, 4, 4, neon), lowbd_entry(v, 8, 8, neon),
- lowbd_entry(v, 16, 16, neon), lowbd_entry(v, 32, 32, neon),
-
- lowbd_entry(h, 4, 4, neon), lowbd_entry(h, 8, 8, neon),
- lowbd_entry(h, 16, 16, neon), lowbd_entry(h, 32, 32, neon),
-
- lowbd_intrapred(smooth, neon), lowbd_intrapred(smooth_v, neon),
- lowbd_intrapred(smooth_h, neon), lowbd_intrapred(paeth, neon),
+ lowbd_intrapred(smooth, neon), lowbd_intrapred(smooth_v, neon),
+ lowbd_intrapred(smooth_h, neon), lowbd_intrapred(paeth, neon),
};
INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 82765ce..526f678 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -468,13 +468,16 @@
aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
aom_smooth_h_predictor_4x4_neon)
-INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_4x8_neon,
- aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
+INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
+ aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
+ aom_dc_128_predictor_4x8_neon, nullptr, nullptr,
+ aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
+ aom_smooth_v_predictor_4x8_neon,
aom_smooth_h_predictor_4x8_neon)
-INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_4x16_neon,
- aom_smooth_predictor_4x16_neon,
+INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
+ aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
+ aom_dc_128_predictor_4x16_neon, nullptr, nullptr,
+ aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
aom_smooth_v_predictor_4x16_neon,
aom_smooth_h_predictor_4x16_neon)
#endif // HAVE_NEON
@@ -555,18 +558,22 @@
aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
aom_smooth_h_predictor_8x8_neon)
-INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_8x4_neon,
- aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
+INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
+ aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
+ aom_dc_128_predictor_8x4_neon, nullptr, nullptr,
+ aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
+ aom_smooth_v_predictor_8x4_neon,
aom_smooth_h_predictor_8x4_neon)
-INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_8x16_neon,
- aom_smooth_predictor_8x16_neon,
+INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
+ aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
+ aom_dc_128_predictor_8x16_neon, nullptr, nullptr,
+ aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
aom_smooth_v_predictor_8x16_neon,
aom_smooth_h_predictor_8x16_neon)
-INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_8x32_neon,
- aom_smooth_predictor_8x32_neon,
+INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
+ aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
+ aom_dc_128_predictor_8x32_neon, nullptr, nullptr,
+ aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
aom_smooth_v_predictor_8x32_neon,
aom_smooth_h_predictor_8x32_neon)
#endif // HAVE_NEON
@@ -683,24 +690,30 @@
aom_smooth_predictor_16x16_neon,
aom_smooth_v_predictor_16x16_neon,
aom_smooth_h_predictor_16x16_neon)
-INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_16x8_neon,
- aom_smooth_predictor_16x8_neon,
+INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
+ aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
+ aom_dc_128_predictor_16x8_neon, nullptr, nullptr,
+ aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
aom_smooth_v_predictor_16x8_neon,
aom_smooth_h_predictor_16x8_neon)
-INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_16x32_neon,
- aom_smooth_predictor_16x32_neon,
+INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
+ aom_dc_left_predictor_16x32_neon,
+ aom_dc_top_predictor_16x32_neon,
+ aom_dc_128_predictor_16x32_neon, nullptr, nullptr,
+ aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
aom_smooth_v_predictor_16x32_neon,
aom_smooth_h_predictor_16x32_neon)
-INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_16x4_neon,
- aom_smooth_predictor_16x4_neon,
+INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
+ aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
+ aom_dc_128_predictor_16x4_neon, nullptr, nullptr,
+ aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
aom_smooth_v_predictor_16x4_neon,
aom_smooth_h_predictor_16x4_neon)
-INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_16x64_neon,
- aom_smooth_predictor_16x64_neon,
+INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
+ aom_dc_left_predictor_16x64_neon,
+ aom_dc_top_predictor_16x64_neon,
+ aom_dc_128_predictor_16x64_neon, nullptr, nullptr,
+ aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
aom_smooth_v_predictor_16x64_neon,
aom_smooth_h_predictor_16x64_neon)
#endif // HAVE_NEON
@@ -808,19 +821,24 @@
aom_smooth_predictor_32x32_neon,
aom_smooth_v_predictor_32x32_neon,
aom_smooth_h_predictor_32x32_neon)
-INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_32x16_neon,
- aom_smooth_predictor_32x16_neon,
+INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
+ aom_dc_left_predictor_32x16_neon,
+ aom_dc_top_predictor_32x16_neon,
+ aom_dc_128_predictor_32x16_neon, nullptr, nullptr,
+ aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
aom_smooth_v_predictor_32x16_neon,
aom_smooth_h_predictor_32x16_neon)
-INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_32x64_neon,
- aom_smooth_predictor_32x64_neon,
+INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
+ aom_dc_left_predictor_32x64_neon,
+ aom_dc_top_predictor_32x64_neon,
+ aom_dc_128_predictor_32x64_neon, nullptr, nullptr,
+ aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
aom_smooth_v_predictor_32x64_neon,
aom_smooth_h_predictor_32x64_neon)
-INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_32x8_neon,
- aom_smooth_predictor_32x8_neon,
+INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
+ aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
+ aom_dc_128_predictor_32x8_neon, nullptr, nullptr,
+ aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
aom_smooth_v_predictor_32x8_neon,
aom_smooth_h_predictor_32x8_neon)
#endif // HAVE_NEON
@@ -912,14 +930,18 @@
aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
aom_smooth_v_predictor_64x64_neon,
aom_smooth_h_predictor_64x64_neon)
-INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_64x32_neon,
- aom_smooth_predictor_64x32_neon,
+INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
+ aom_dc_left_predictor_64x32_neon,
+ aom_dc_top_predictor_64x32_neon,
+ aom_dc_128_predictor_64x32_neon, nullptr, nullptr,
+ aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
aom_smooth_v_predictor_64x32_neon,
aom_smooth_h_predictor_64x32_neon)
-INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr, nullptr,
- nullptr, aom_paeth_predictor_64x16_neon,
- aom_smooth_predictor_64x16_neon,
+INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
+ aom_dc_left_predictor_64x16_neon,
+ aom_dc_top_predictor_64x16_neon,
+ aom_dc_128_predictor_64x16_neon, nullptr, nullptr,
+ aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
aom_smooth_v_predictor_64x16_neon,
aom_smooth_h_predictor_64x16_neon)
#endif // HAVE_NEON