sum_neon.h: gather horizontal_add_*() functions
This allows for consistent use of the vaddv/vaddlv horizontal-reduction intrinsics on aarch64, with a single shared fallback implementation elsewhere.
Bug: b/217282899
Bug: b/231719821
Change-Id: I16de3905c72aa79837fbabc01b1e7ea281792e89
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 42133b8..b295d71 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -17,6 +17,15 @@
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
+#if !defined(__aarch64__)
+static INLINE uint32x2_t horizontal_add_u16x8_v(const uint16x8_t a) {
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+}
+#endif
+
unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
const uint8x16_t b = load_unaligned_u8q(a, a_stride);
const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
@@ -24,14 +33,13 @@
const uint32_t d = vaddlvq_u16(c);
return (d + 8) >> 4;
#else
- const uint32x2_t d = horizontal_add_u16x8(c);
+ const uint32x2_t d = horizontal_add_u16x8_v(c);
return vget_lane_u32(vrshr_n_u32(d, 4), 0);
#endif
}
unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
uint16x8_t sum;
- uint32x2_t d;
uint8x8_t b = vld1_u8(a);
a += a_stride;
uint8x8_t c = vld1_u8(a);
@@ -44,9 +52,13 @@
sum = vaddw_u8(sum, e);
}
- d = horizontal_add_u16x8(sum);
-
+#if defined(__aarch64__)
+ const uint32_t d = vaddlvq_u16(sum);
+ return (d + 32) >> 6;
+#else
+ const uint32x2_t d = horizontal_add_u16x8_v(sum);
return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+#endif
}
void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
@@ -157,11 +169,7 @@
} while (length != 0);
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-#ifdef __aarch64__
- return vaddvq_s32(accum);
-#else
return horizontal_add_s32x4(accum);
-#endif // __aarch64__
}
int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
@@ -186,13 +194,8 @@
v_sse = vmlal_s16(v_sse, v_high, v_high);
#endif
}
-#if defined(__aarch64__)
- int mean = vaddvq_s32(v_mean);
- int sse = (int)vaddvq_s32(v_sse);
-#else
int mean = horizontal_add_s32x4(v_mean);
int sse = horizontal_add_s32x4(v_sse);
-#endif
// (mean * mean): dynamic range 31 bits.
int var = sse - ((mean * mean) >> (bwl + 2));
return var;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 22f2e64..b62628e 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -15,19 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
+#include "aom_dsp/arm/sum_neon.h"
// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
@@ -120,10 +108,10 @@
ref3 += ref_stride;
}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+ res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
@@ -164,10 +152,10 @@
ref3 += ref_stride;
}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+ res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
@@ -219,24 +207,10 @@
ref3 += ref_stride;
}
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-static INLINE unsigned int horizontal_add_16x4(const uint16x4_t vec_16x4) {
- const uint32x2_t a = vpaddl_u16(vec_16x4);
- const uint64x1_t b = vpaddl_u32(a);
- return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-}
-
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
- const uint32x4_t a = vpaddlq_u16(vec_16x8);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
+ res[0] = horizontal_long_add_u16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_u16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_u16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_u16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
@@ -330,10 +304,10 @@
sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
- res[0] += horizontal_add_16x4(q0);
- res[1] += horizontal_add_16x4(q1);
- res[2] += horizontal_add_16x4(q2);
- res[3] += horizontal_add_16x4(q3);
+ res[0] += horizontal_add_u16x4(q0);
+ res[1] += horizontal_add_u16x4(q1);
+ res[2] += horizontal_add_u16x4(q2);
+ res[3] += horizontal_add_u16x4(q3);
}
break;
}
@@ -357,10 +331,10 @@
ref2 += ref_stride;
ref3 += ref_stride;
- res[0] += horizontal_add_16x4(q0);
- res[1] += horizontal_add_16x4(q1);
- res[2] += horizontal_add_16x4(q2);
- res[3] += horizontal_add_16x4(q3);
+ res[0] += horizontal_add_u16x4(q0);
+ res[1] += horizontal_add_u16x4(q1);
+ res[2] += horizontal_add_u16x4(q2);
+ res[3] += horizontal_add_u16x4(q3);
}
break;
}
@@ -384,10 +358,10 @@
ref2 += ref_stride;
ref3 += ref_stride;
- res[0] += horizontal_add_16x8(q0);
- res[1] += horizontal_add_16x8(q1);
- res[2] += horizontal_add_16x8(q2);
- res[3] += horizontal_add_16x8(q3);
+ res[0] += horizontal_add_u16x8(q0);
+ res[1] += horizontal_add_u16x8(q1);
+ res[2] += horizontal_add_u16x8(q2);
+ res[3] += horizontal_add_u16x8(q3);
}
break;
}
@@ -418,10 +392,10 @@
ref2 += ref_stride;
ref3 += ref_stride;
- res[0] += horizontal_add_16x8(q0);
- res[1] += horizontal_add_16x8(q1);
- res[2] += horizontal_add_16x8(q2);
- res[3] += horizontal_add_16x8(q3);
+ res[0] += horizontal_add_u16x8(q0);
+ res[1] += horizontal_add_u16x8(q1);
+ res[2] += horizontal_add_u16x8(q2);
+ res[3] += horizontal_add_u16x8(q3);
}
break;
}
@@ -466,10 +440,10 @@
ref2 += ref_stride;
ref3 += ref_stride;
- res[0] += horizontal_add_16x8(q0);
- res[1] += horizontal_add_16x8(q1);
- res[2] += horizontal_add_16x8(q2);
- res[3] += horizontal_add_16x8(q3);
+ res[0] += horizontal_add_u16x8(q0);
+ res[1] += horizontal_add_u16x8(q1);
+ res[2] += horizontal_add_u16x8(q2);
+ res[3] += horizontal_add_u16x8(q3);
}
break;
}
@@ -542,10 +516,10 @@
ref2 += ref_stride;
ref3 += ref_stride;
- res[0] += horizontal_add_16x8(q0);
- res[1] += horizontal_add_16x8(q1);
- res[2] += horizontal_add_16x8(q2);
- res[3] += horizontal_add_16x8(q3);
+ res[0] += horizontal_add_u16x8(q0);
+ res[1] += horizontal_add_u16x8(q1);
+ res[2] += horizontal_add_u16x8(q2);
+ res[3] += horizontal_add_u16x8(q3);
}
}
}
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 4f0a199..acd2c54 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -13,6 +13,7 @@
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
@@ -107,26 +108,6 @@
return vget_lane_u32(d5, 0);
}
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
- const uint32x4_t a = vpaddlq_u16(vec_16x8);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-
unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
@@ -160,7 +141,7 @@
vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
vget_high_u8(vec_ref_48));
}
- return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+ return horizontal_long_add_u16x8(vec_accum_lo, vec_accum_hi);
}
unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
@@ -256,7 +237,7 @@
vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
vget_high_u8(vec_ref_16));
}
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+ return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
@@ -275,7 +256,7 @@
vec_accum_hi =
vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
}
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+ return horizontal_add_u16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
@@ -290,7 +271,7 @@
ref += ref_stride;
vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
}
- return horizontal_add_16x8(vec_accum);
+ return horizontal_add_u16x8(vec_accum);
}
static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -343,7 +324,7 @@
src_ptr += src_stride;
ref_ptr += ref_stride;
- sum += horizontal_add_16x8(q3);
+ sum += horizontal_add_u16x8(q3);
}
return sum;
@@ -379,7 +360,7 @@
src_ptr += src_stride;
ref_ptr += ref_stride;
- sum += horizontal_add_16x8(q3);
+ sum += horizontal_add_u16x8(q3);
}
return sum;
@@ -402,7 +383,7 @@
q2 = vabdq_u8(q0, q1);
q3 = vpadalq_u8(q3, q2);
- sum += horizontal_add_16x8(q3);
+ sum += horizontal_add_u16x8(q3);
src_ptr += src_stride;
ref_ptr += ref_stride;
@@ -447,7 +428,7 @@
ref_ptr += ref_stride;
q3 = vabal_u8(q3, q0, q1);
}
- return horizontal_add_16x8(q3);
+ return horizontal_add_u16x8(q3);
}
static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
@@ -474,7 +455,7 @@
q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
}
- return horizontal_add_16x8(q3);
+ return horizontal_add_u16x8(q3);
}
#define FSADS128_H(h) \
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index 35b784a..a69dfb5 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -63,11 +63,7 @@
b += b_stride << 1;
y += 2;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
case 8:
do {
@@ -76,11 +72,7 @@
b += b_stride;
y += 1;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
case 16:
do {
@@ -89,11 +81,7 @@
b += b_stride;
y += 1;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
case 32:
do {
@@ -103,11 +91,7 @@
b += b_stride;
y += 1;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
case 64:
do {
@@ -119,11 +103,7 @@
b += b_stride;
y += 1;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
case 128:
do {
@@ -139,11 +119,7 @@
b += b_stride;
y += 1;
} while (y < height);
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
default:
if (width & 0x07) {
@@ -171,11 +147,7 @@
y += 1;
} while (y < height);
}
-#if defined(__aarch64__)
- sse = vaddvq_u32(sum);
-#else
sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif // __aarch64__
break;
}
return sse;
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 809e51c..a118f3c 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -14,24 +14,64 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
+static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+#if defined(__aarch64__)
+ return vaddlvq_s16(a);
+#else
+ const int32x4_t b = vpaddlq_s16(a);
+ const int64x2_t c = vpaddlq_s32(b);
+ const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+ vreinterpret_s32_s64(vget_high_s64(c)));
+ return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_s32(a);
+#else
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
+#endif
}
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
+static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+#if defined(__aarch64__)
+ return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
}
-static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) {
+static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+#if defined(__aarch64__)
+ return vaddlvq_u16(a);
+#else
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
- return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
- vreinterpret_u32_u64(vget_high_u64(c)));
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+#if defined(__aarch64__)
+ return vaddlv_u16(a);
+#else
+ const uint32x2_t b = vpaddl_u16(a);
+ const uint64x1_t c = vpaddl_u32(b);
+ return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif
}
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index e840f13..3378491 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -475,13 +475,8 @@
b += 4 * b_stride;
}
-#if defined(__aarch64__)
- *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
- *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
*sum = horizontal_add_s16x8(sum_s16);
*sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
}
// Process a block of any size where the width is divisible by 16.
@@ -524,13 +519,8 @@
b += b_stride;
}
-#if defined(__aarch64__)
- *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
- *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
*sum = horizontal_add_s16x8(sum_s16);
*sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
}
// Process a block of width 8 two rows at a time.
@@ -568,13 +558,8 @@
i += 2;
} while (i < h);
-#if defined(__aarch64__)
- *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
- *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
*sum = horizontal_add_s16x8(sum_s16);
*sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
}
#define VARIANCE_NXM(n, m, shift) \
@@ -635,11 +620,10 @@
v_diff = vpadalq_s16(v_diff, sum_s16);
v_sse = vpadalq_s32(v_sse, sse_s32);
}
+ int diff = horizontal_add_s32x4(v_diff);
#if defined(__aarch64__)
- int diff = vaddvq_s32(v_diff);
uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
#else
- int diff = horizontal_add_s32x4(v_diff);
uint32_t sq = vget_lane_u32(
vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
0);
diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
index ad81f40..3528105 100644
--- a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
+++ b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -24,6 +24,9 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if defined(__aarch64__)
+ return vaddlvq_s8(v_sum_diff_total);
+#else
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
@@ -31,6 +34,7 @@
vget_low_s64(fedcba98_76543210));
const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
return sum_diff;
+#endif
}
// Denoise a 16x1 vector.
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index 8b5888f..0a87503 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -15,6 +15,7 @@
#include <math.h>
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
#include "aom_mem/aom_mem.h"
#include "av1/common/quant_common.h"
@@ -206,17 +207,6 @@
return v_nz_mask;
}
-static INLINE uint32_t sum_abs_coeff(const uint16x8_t a) {
-#if defined(__aarch64__)
- return vaddvq_u16(a);
-#else
- const uint32x4_t b = vpaddlq_u16(a);
- const uint64x2_t c = vpaddlq_u32(b);
- const uint64x1_t d = vadd_u64(vget_low_u64(c), vget_high_u64(c));
- return (uint32_t)vget_lane_u64(d, 0);
-#endif
-}
-
static void quantize_fp_no_qmatrix_neon(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -246,7 +236,7 @@
const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
// If the coefficient is in the base ZBIN range, then discard.
- if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
non_zero_count -= 16;
} else {
break;