Add Neon optimizations for aom_avg_8x8 and aom_avg_4x4

Ported from the vp9 implementations in libvpx.

0.5% overall encoder speedup.
Change-Id: Ife883b070fd923c1ce9c3cb0e6a43304dcde472c
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 471bff9..1e45597 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -244,7 +244,8 @@
"${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
"${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c")
+ "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_neon.c")
list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
"${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 65a6ba9..8c51e7f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -857,10 +857,10 @@
# Avg
#
add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
- specialize qw/aom_avg_8x8 sse2/;
+ specialize qw/aom_avg_8x8 sse2 neon/;
add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
- specialize qw/aom_avg_4x4 sse2/;
+ specialize qw/aom_avg_4x4 sse2 neon/;
add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/aom_minmax_8x8 sse2/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000..455f01d
--- /dev/null
+++ b/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
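+// Average the 16 pixels of a 4x4 block. The rounding shift by 4 matches the
+// (sum + 8) >> 4 of aom_avg_4x4_c().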
+unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
+ const uint8x16_t b = load_unaligned_u8q(a, a_stride);
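+  // Add the low half (rows 0-1) to the high half (rows 2-3), widening to u16.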
+ const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+#if defined(__aarch64__)
+ const uint32_t d = vaddlvq_u16(c);
+  return (d + 8) >> 4;
+#else
+ const uint32x2_t d = horizontal_add_u16x8(c);
+ return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+#endif
+}
+
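+// Average the 64 pixels of an 8x8 block. Each uint16x8_t lane accumulates one
+// column (at most 8 * 255, so no overflow); the rounding shift by 6 matches
+// the (sum + 32) >> 6 of aom_avg_8x8_c().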
+unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
+ uint16x8_t sum;
+ uint32x2_t d;
+ uint8x8_t b = vld1_u8(a);
+ a += a_stride;
+ uint8x8_t c = vld1_u8(a);
+ a += a_stride;
+ sum = vaddl_u8(b, c);
+
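+  // Accumulate the remaining six rows.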
+ for (int i = 0; i < 6; ++i) {
+ const uint8x8_t e = vld1_u8(a);
+ a += a_stride;
+ sum = vaddw_u8(sum, e);
+ }
+
+ d = horizontal_add_u16x8(sum);
+
+ return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+}
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000..809e51c
--- /dev/null
+++ b/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
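+// Horizontally add the 8 signed 16-bit lanes of v_16x8 into a single sum.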
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
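+// Horizontally add the 4 signed 32-bit lanes of v_32x4 into a single sum.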
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
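+// Horizontally add the 8 unsigned 16-bit lanes of a. For totals below 2^32,
+// lane 0 of the result holds the sum and lane 1 is zero.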
+static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) {
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+}
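+
+#endif  // AOM_DSP_ARM_SUM_NEON_H_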
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 74385a6..d94f6e5 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -13,25 +13,10 @@
#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"
-
+#include "aom_dsp/arm/sum_neon.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, uint32_t *sse,
diff --git a/av1/common/arm/mem_neon.h b/av1/common/arm/mem_neon.h
index 10e8a16..171055f 100644
--- a/av1/common/arm/mem_neon.h
+++ b/av1/common/arm/mem_neon.h
@@ -316,6 +316,27 @@
*s3 = vld1q_s16(s);
}
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x4_t a_u32 = vdupq_n_u32(0);
+ if (stride == 4) return vld1q_u8(buf);
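+  // Use memcpy() for each 4-byte load so no alignment is assumed.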
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
uint32x2_t *tu0, uint32x2_t *tu1,
uint32x2_t *tu2, uint32x2_t *tu3) {