Refactor Neon SAD4D functions
1) Re-roll loops for large block widths to avoid source duplication -
tightening things up before adding averaging SAD4D Neon functions
in a subsequent patch.
2) Change inner-loop instances of "i * stride" to increment a pointer
   offset variable at the end of each loop iteration, as some
   compilers failed to recognize and simplify the strided increment
   pattern.
3) As a result of 2), we also count down in outer loops and test
i != 0 for the termination condition.
Change-Id: I2eed2ce794248ae6f961ccc06cdc61bef9bd49bf
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 467f44c..bc73fb8 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -26,67 +26,39 @@
*sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
}
-static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
+static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h) {
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
- int i = 0;
+ int ref_offset = 0;
+ int i = h;
do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
- const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
- sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
- sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
- sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
- sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+ j += 32;
+ } while (j < w);
- const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
- sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
- sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
- sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
- sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
-
- const uint8x16_t s4 = vld1q_u8(src + i * src_stride + 64);
- sad16_neon(s4, vld1q_u8(ref[0] + i * ref_stride + 64), &sum_lo[0]);
- sad16_neon(s4, vld1q_u8(ref[1] + i * ref_stride + 64), &sum_lo[1]);
- sad16_neon(s4, vld1q_u8(ref[2] + i * ref_stride + 64), &sum_lo[2]);
- sad16_neon(s4, vld1q_u8(ref[3] + i * ref_stride + 64), &sum_lo[3]);
-
- const uint8x16_t s5 = vld1q_u8(src + i * src_stride + 80);
- sad16_neon(s5, vld1q_u8(ref[0] + i * ref_stride + 80), &sum_hi[0]);
- sad16_neon(s5, vld1q_u8(ref[1] + i * ref_stride + 80), &sum_hi[1]);
- sad16_neon(s5, vld1q_u8(ref[2] + i * ref_stride + 80), &sum_hi[2]);
- sad16_neon(s5, vld1q_u8(ref[3] + i * ref_stride + 80), &sum_hi[3]);
-
- const uint8x16_t s6 = vld1q_u8(src + i * src_stride + 96);
- sad16_neon(s6, vld1q_u8(ref[0] + i * ref_stride + 96), &sum_lo[0]);
- sad16_neon(s6, vld1q_u8(ref[1] + i * ref_stride + 96), &sum_lo[1]);
- sad16_neon(s6, vld1q_u8(ref[2] + i * ref_stride + 96), &sum_lo[2]);
- sad16_neon(s6, vld1q_u8(ref[3] + i * ref_stride + 96), &sum_lo[3]);
-
- const uint8x16_t s7 = vld1q_u8(src + i * src_stride + 112);
- sad16_neon(s7, vld1q_u8(ref[0] + i * ref_stride + 112), &sum_hi[0]);
- sad16_neon(s7, vld1q_u8(ref[1] + i * ref_stride + 112), &sum_hi[1]);
- sad16_neon(s7, vld1q_u8(ref[2] + i * ref_stride + 112), &sum_hi[2]);
- sad16_neon(s7, vld1q_u8(ref[3] + i * ref_stride + 112), &sum_hi[3]);
-
- i++;
- } while (i < h);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
@@ -96,84 +68,22 @@
vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
- uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum[4];
-
- int i = 0;
- do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
-
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
-
- const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
- sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
- sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
- sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
- sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
-
- const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
- sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
- sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
- sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
- sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
-
- i++;
- } while (i < h);
-
- sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
- sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
- sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
- sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h);
}
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
- uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- uint32x4_t sum[4];
-
- int i = 0;
- do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
-
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
-
- i++;
- } while (i < h);
-
- sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
- sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
- sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
- sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 32, h);
}
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
@@ -182,16 +92,18 @@
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
- int i = 0;
+ int ref_offset = 0;
+ int i = h;
do {
- const uint8x16_t s = vld1q_u8(src + i * src_stride);
- sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
- sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
- sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
- sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
- i++;
- } while (i < h);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
@@ -204,13 +116,15 @@
*sad_sum = vpadalq_u8(*sad_sum, abs_diff);
}
-static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4], int h) {
+static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h, int h_overflow) {
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
- int h_tmp = h > 32 ? 32 : h;
+ int h_limit = h > h_overflow ? h_overflow : h;
+ int ref_offset = 0;
int i = 0;
do {
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
@@ -219,56 +133,26 @@
vdupq_n_u16(0) };
do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
- const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
- sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
- sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
- sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
- sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+ j += 32;
+ } while (j < w);
- const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
- sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
- sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
- sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
- sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
-
- const uint8x16_t s4 = vld1q_u8(src + i * src_stride + 64);
- sad16_neon(s4, vld1q_u8(ref[0] + i * ref_stride + 64), &sum_lo[0]);
- sad16_neon(s4, vld1q_u8(ref[1] + i * ref_stride + 64), &sum_lo[1]);
- sad16_neon(s4, vld1q_u8(ref[2] + i * ref_stride + 64), &sum_lo[2]);
- sad16_neon(s4, vld1q_u8(ref[3] + i * ref_stride + 64), &sum_lo[3]);
-
- const uint8x16_t s5 = vld1q_u8(src + i * src_stride + 80);
- sad16_neon(s5, vld1q_u8(ref[0] + i * ref_stride + 80), &sum_hi[0]);
- sad16_neon(s5, vld1q_u8(ref[1] + i * ref_stride + 80), &sum_hi[1]);
- sad16_neon(s5, vld1q_u8(ref[2] + i * ref_stride + 80), &sum_hi[2]);
- sad16_neon(s5, vld1q_u8(ref[3] + i * ref_stride + 80), &sum_hi[3]);
-
- const uint8x16_t s6 = vld1q_u8(src + i * src_stride + 96);
- sad16_neon(s6, vld1q_u8(ref[0] + i * ref_stride + 96), &sum_lo[0]);
- sad16_neon(s6, vld1q_u8(ref[1] + i * ref_stride + 96), &sum_lo[1]);
- sad16_neon(s6, vld1q_u8(ref[2] + i * ref_stride + 96), &sum_lo[2]);
- sad16_neon(s6, vld1q_u8(ref[3] + i * ref_stride + 96), &sum_lo[3]);
-
- const uint8x16_t s7 = vld1q_u8(src + i * src_stride + 112);
- sad16_neon(s7, vld1q_u8(ref[0] + i * ref_stride + 112), &sum_hi[0]);
- sad16_neon(s7, vld1q_u8(ref[1] + i * ref_stride + 112), &sum_hi[1]);
- sad16_neon(s7, vld1q_u8(ref[2] + i * ref_stride + 112), &sum_hi[2]);
- sad16_neon(s7, vld1q_u8(ref[3] + i * ref_stride + 112), &sum_hi[3]);
-
- i++;
- } while (i < h_tmp);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (++i < h_limit);
sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
@@ -279,67 +163,22 @@
sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
- h_tmp += 32;
+ h_limit += h_overflow;
} while (i < h);
vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
+}
+
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
- uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
- vdupq_n_u32(0) };
- int h_tmp = h > 64 ? 64 : h;
-
- int i = 0;
- do {
- uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
- uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
-
- do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
-
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
-
- const uint8x16_t s2 = vld1q_u8(src + i * src_stride + 32);
- sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
- sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
- sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
- sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
-
- const uint8x16_t s3 = vld1q_u8(src + i * src_stride + 48);
- sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
- sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
- sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
- sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
-
- i++;
- } while (i < h_tmp);
-
- sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
- sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
- sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
- sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
- sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
- sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
- sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
- sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
-
- h_tmp += 64;
- } while (i < h);
-
- vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
}
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
@@ -350,22 +189,24 @@
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
- int i = 0;
+ int ref_offset = 0;
+ int i = h;
do {
- const uint8x16_t s0 = vld1q_u8(src + i * src_stride);
- sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
- sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
- sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
- sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+ const uint8x16_t s0 = vld1q_u8(src);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
- const uint8x16_t s1 = vld1q_u8(src + i * src_stride + 16);
- sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
- sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
- sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
- sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+ const uint8x16_t s1 = vld1q_u8(src + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
- i++;
- } while (i < h);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
}
@@ -377,16 +218,18 @@
vdupq_n_u16(0) };
uint32x4_t sum_u32[4];
- int i = 0;
+ int ref_offset = 0;
+ int i = h;
do {
- const uint8x16_t s = vld1q_u8(src + i * src_stride);
- sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum_u16[0]);
- sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum_u16[1]);
- sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum_u16[2]);
- sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum_u16[3]);
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
- i++;
- } while (i < h);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
sum_u32[0] = vpaddlq_u16(sum_u16[0]);
sum_u32[1] = vpaddlq_u16(sum_u16[1]);
@@ -398,28 +241,30 @@
#endif // defined(__ARM_FEATURE_DOTPROD)
-static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
- uint16x8_t *const sad_sum) {
- uint8x8_t abs_diff = vabd_u8(src, ref);
- *sad_sum = vaddw_u8(*sad_sum, abs_diff);
-}
-
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
- uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
+ uint16x8_t sum[4];
- int i = 0;
+ uint8x8_t s = vld1_u8(src);
+ sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
+ sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
+ sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
+ sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
+
+ src += src_stride;
+ int ref_offset = ref_stride;
+ int i = h - 1;
do {
- const uint8x8_t s = vld1_u8(src + i * src_stride);
- sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
- sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
- sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
- sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
+ s = vld1_u8(src);
+ sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
+ sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
+ sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
+ sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
- i++;
- } while (i < h);
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
@@ -427,24 +272,37 @@
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
- uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
+ uint16x8_t sum[4];
- int i = 0;
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
+
+ sum[0] = vabdl_u8(s, r0);
+ sum[1] = vabdl_u8(s, r1);
+ sum[2] = vabdl_u8(s, r2);
+ sum[3] = vabdl_u8(s, r3);
+
+ src += 2 * src_stride;
+ int ref_offset = 2 * ref_stride;
+ int i = (h - 1) / 2;
do {
- uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
- uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
- uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
- uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
- uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+ s = load_unaligned_u8(src, src_stride);
+ r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
- sad8_neon(s, r0, &sum[0]);
- sad8_neon(s, r1, &sum[1]);
- sad8_neon(s, r2, &sum[2]);
- sad8_neon(s, r3, &sum[3]);
+ sum[0] = vabal_u8(sum[0], s, r0);
+ sum[1] = vabal_u8(sum[1], s, r1);
+ sum[2] = vabal_u8(sum[2], s, r2);
+ sum[3] = vabal_u8(sum[3], s, r3);
- i += 2;
- } while (i < h);
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ } while (--i != 0);
vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}