Add ARM Neon optimization of av1_build_compound_diffwtd_mask_d16
Speed-up over the C implementation (c/neon) by block size:
8x4 5.04x
8x8 5.99x
16x8 6.37x
16x16 6.32x
32x16 3.14x
32x32 3.22x
64x32 3.01x
64x64 3.03x
128x64 2.57x
128x128 2.80x
Change-Id: I9d70e451c3d9ca365bb8879d10c62e726f519f49
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 4b8acd5..4b48418 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -292,6 +292,7 @@
"${AOM_ROOT}/av1/common/arm/transpose_neon.h"
"${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
"${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
"${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
"${AOM_ROOT}/av1/common/cdef_block_neon.c")
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
new file mode 100644
index 0000000..44e0641
--- /dev/null
+++ b/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,86 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+void av1_build_compound_diffwtd_mask_d16_neon(  // Neon variant of av1_build_compound_diffwtd_mask_d16.
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ assert(h >= 4);
+ assert(w >= 4);
+ assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+ const int round =  // Right-shift applied to |src0 - src1| before it is scaled.
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ uint16x8_t diff_q, tmp0, tmp1;
+ uint8x8_t diff_d, diff_select;
+ const CONV_BUF_TYPE *src0_1, *src1_1;  // Read cursors for the row(s) being processed.
+ const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));  // Negated: vrshlq_u16 with a negative count is a rounding right shift.
+ const uint8x8_t dup_38 = vdup_n_u8(38);  // Base value added to the scaled difference.
+ const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);  // Upper clamp for the mask value.
+ if (mask_type == DIFFWTD_38) {  // Selector for vbsl_u8 below.
+ diff_select = vdup_n_u8(255);  // All-ones: keep the clamped value as is.
+ } else {
+ diff_select = vdup_n_u8(0);  // Zero: take (max alpha - clamped value) instead.
+ }
+ if (w >= 8) {  // One row per outer pass, 8 pixels per inner pass.
+ for (int i = 0; i < h; ++i) {
+ src0_1 = src0;
+ src1_1 = src1;
+ for (int j = 0; j < w; j += 8) {  // NOTE(review): assumes w is a multiple of 8 - confirm against callers.
+ __builtin_prefetch(src0_1);
+ __builtin_prefetch(src1_1);
+ diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));  // Per-lane |src0 - src1|.
+ diff_q = vrshlq_u16(diff_q, dup_round);  // Rounding shift right by 'round'.
+ diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);  // Shift right again, narrow to the low 8 bits.
+ diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);  // min(38 + diff, max alpha); the 8-bit add wraps.
+ diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));  // Direct or inverted value per mask_type.
+ vst1_u8(mask, diff_d);
+ src0_1 += 8;
+ src1_1 += 8;
+ mask += 8;  // Mask rows are written back to back; there is no mask stride.
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+ } else if (w == 4) {  // Two 4-pixel rows are packed into one 8-lane vector.
+ for (int i = 0; i < h; i += 2) {  // NOTE(review): assumes h is even - confirm against callers.
+ src0_1 = src0;
+ src1_1 = src1;
+ __builtin_prefetch(src0_1 + 0 * src0_stride);
+ __builtin_prefetch(src0_1 + 1 * src0_stride);
+ __builtin_prefetch(src1_1 + 0 * src1_stride);
+ __builtin_prefetch(src1_1 + 1 * src1_stride);
+ tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),  // Low half: row i, high half: row i + 1.
+ vld1_u16(src0_1 + (1 * src0_stride)));
+ tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
+ vld1_u16(src1_1 + (1 * src1_stride)));
+ diff_q = vabdq_u16(tmp0, tmp1);  // Same per-lane pipeline as the w >= 8 path.
+ diff_q = vrshlq_u16(diff_q, dup_round);
+ diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+ diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+ diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+ vst1_u8(mask, diff_d);  // One store covers both rows (8 contiguous mask bytes).
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += w * 2;
+ }
+ }
+}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 373e292..6aa9255 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -152,7 +152,7 @@
specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
#
# Encoder functions below this point.
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index 2f4dc3a..4f74c81 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -48,13 +48,14 @@
typedef ::testing::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
BuildCompDiffwtdMaskD16Param;
+#if HAVE_SSE4_1 || HAVE_NEON
::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
buildcompdiffwtdmaskd16_func filter) {
return ::testing::Combine(::testing::Range(8, 13, 2),
::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-
+#endif
class BuildCompDiffwtdMaskD16Test
: public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
public:
@@ -157,7 +158,7 @@
printf("av1_build_compound_diffwtd_mask_d16 test_code %3dx%-3d: %7.2f us\n",
width, height, 1000.0 * elapsed_time1 / num_loops);
}
-
+#if HAVE_SSE4_1
void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
const DIFFWTD_MASK_TYPE type) {
const int width = block_size_wide[sb_type];
@@ -206,7 +207,7 @@
RunTest(GetParam(), 1, DIFFWTD_38);
RunTest(GetParam(), 1, DIFFWTD_38_INV);
}
-
+#endif
TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
RunCheckOutput(GET_PARAM(1));
}
@@ -225,4 +226,9 @@
BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
+ BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+#endif
+
} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 52fe380..8594d05 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -186,6 +186,7 @@
"${AOM_ROOT}/test/obmc_variance_test.cc"
"${AOM_ROOT}/test/sad_test.cc"
"${AOM_ROOT}/test/subtract_test.cc"
+ "${AOM_ROOT}/test/reconinter_test.cc"
"${AOM_ROOT}/test/sum_squares_test.cc"
"${AOM_ROOT}/test/variance_test.cc")
@@ -194,7 +195,6 @@
"${AOM_ROOT}/test/av1_quantize_test.cc"
"${AOM_ROOT}/test/corner_match_test.cc"
"${AOM_ROOT}/test/quantize_func_test.cc"
- "${AOM_ROOT}/test/reconinter_test.cc"
"${AOM_ROOT}/test/simd_cmp_sse4.cc")
if(HAVE_SSE4_1)