Add NEON implementation of aom_sse()
Change-Id: Ib936949e730e2f397b7750bc1ab353916b5b2f39
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 3195b65..16ceaba 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -277,7 +277,8 @@
"${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
"${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/avg_neon.c")
+ "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon.c")
list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
"${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6b31d02..382347f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -92,6 +92,7 @@
specialize qw/aom_dc_top_predictor_16x4 sse2/;
specialize qw/aom_dc_top_predictor_16x8 sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+
specialize qw/aom_dc_top_predictor_16x32 sse2/;
specialize qw/aom_dc_top_predictor_16x64 sse2/;
specialize qw/aom_dc_top_predictor_32x8 sse2/;
@@ -597,7 +598,7 @@
specialize qw/aom_subtract_block neon msa sse2 avx2/;
add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
- specialize qw/aom_sse sse4_1 avx2/;
+ specialize qw/aom_sse sse4_1 avx2 neon/;
add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
specialize qw/aom_get_blk_sse_sum sse2 avx2/;
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
new file mode 100644
index 0000000..6f61b91
--- /dev/null
+++ b/aom_dsp/arm/sse_neon.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Returns the sum of squared differences (SSE) between two 8-bit blocks.
+//
+// a, b:               top-left pixels of the two blocks.
+// a_stride, b_stride: distance in bytes between consecutive rows.
+// width, height:      block dimensions in pixels. Any width is accepted and
+//                     no bytes past `width` are read from either row.
+//
+// The total is accumulated in 64 bits, so it cannot wrap for large blocks.
+int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int width, int height) {
+  int64_t sse = 0;
+  for (int y = 0; y < height; y++) {
+    // Per-row 32-bit lane sums, folded into the 64-bit total after each row.
+    // A lane can only overflow for rows wider than 264,000 pixels.
+    uint32x4_t row_sum = vdupq_n_u32(0);
+    int x = 0;
+
+    // 16 pixels per step, full vectors only.
+    for (; width - x >= 16; x += 16) {
+      const uint8x16_t diff = vabdq_u8(vld1q_u8(a + x), vld1q_u8(b + x));
+      const uint8x8_t lo = vget_low_u8(diff);
+      const uint8x8_t hi = vget_high_u8(diff);
+      // 255 * 255 fits in 16 bits, so the widening squares cannot overflow;
+      // vpadalq_u16 then adds adjacent pairs into the 32-bit lanes.
+      row_sum = vpadalq_u16(row_sum, vmull_u8(lo, lo));
+      row_sum = vpadalq_u16(row_sum, vmull_u8(hi, hi));
+    }
+    // At most one 8-pixel step for what is left of the row.
+    if (width - x >= 8) {
+      const uint8x8_t diff = vabd_u8(vld1_u8(a + x), vld1_u8(b + x));
+      row_sum = vpadalq_u16(row_sum, vmull_u8(diff, diff));
+      x += 8;
+    }
+    // Scalar tail (fewer than 8 pixels) instead of loading past the row end.
+    for (; x < width; x++) {
+      const int diff = a[x] - b[x];
+      sse += diff * diff;
+    }
+
+    // Horizontal add: 4 x u32 -> 2 x u64 -> scalar.
+    const uint64x2_t pair = vpaddlq_u32(row_sum);
+    sse += (int64_t)(vgetq_lane_u64(pair, 0) + vgetq_lane_u64(pair, 1));
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 30e1311..4644e71 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -385,6 +385,15 @@
RunTest(1, width_, height, 100);
}
}
+
+#if HAVE_NEON
+TestSSEFuncs sse_neon[] = {
+ TestSSEFuncs(&aom_sse_c, &aom_sse_neon),  // C version paired with NEON
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
+ Combine(ValuesIn(sse_neon), Range(4, 129, 4)));  // 4, 8, ..., 128
+#endif // HAVE_NEON
+
#if HAVE_SSE4_1
TestSSEFuncs sse_sse4[] = {
TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),