Count down in Neon sum square loops
Counting down and terminating on a comparison with zero allows us
to use flag-setting arithmetic instructions, avoiding
an additional CMP instruction before the conditional branch.
This doesn't really affect performance, but code size is reduced
by a useful amount since the loop prologues are also shorter.
Note that the new `h != 0` termination condition requires height to
be a positive multiple of 4, since each iteration consumes four rows.
Change-Id: Id623b28f804198ea6bf36361933bef8f17baec84
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index bf212a9..095a2c6 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -35,7 +35,7 @@
int stride, int height) {
int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
- int h = 0;
+ int h = height;
do {
int16x4_t s0 = vld1_s16(src + 0 * stride);
int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -48,8 +48,8 @@
sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
src += 4 * stride;
- h += 4;
- } while (h < height);
+ h -= 4;
+ } while (h != 0);
return horizontal_long_add_u32x4(
vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
@@ -60,7 +60,7 @@
int height) {
uint64x2_t sum_squares = vdupq_n_u64(0);
- int h = 0;
+ int h = height;
do {
int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int w = 0;
@@ -86,8 +86,8 @@
sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
src += 4 * stride;
- h += 4;
- } while (h < height);
+ h -= 4;
+ } while (h != 0);
return horizontal_add_u64x2(sum_squares);
}
@@ -134,7 +134,7 @@
int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
- int h = 0;
+ int h = height;
do {
int16x4_t s0 = vld1_s16(src + 0 * stride);
int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -152,8 +152,8 @@
sum_acc[1] = vpadal_s16(sum_acc[1], s3);
src += 4 * stride;
- h += 4;
- } while (h < height);
+ h -= 4;
+ } while (h != 0);
*sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
return horizontal_long_add_u32x4(
@@ -166,7 +166,7 @@
uint64x2_t sse = vdupq_n_u64(0);
int32x4_t sum_acc = vdupq_n_s32(0);
- int h = 0;
+ int h = height;
do {
int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int w = 0;
@@ -198,8 +198,8 @@
vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
src += 4 * stride;
- h += 4;
- } while (h < height);
+ h -= 4;
+ } while (h != 0);
*sum += horizontal_add_s32x4(sum_acc);
return horizontal_add_u64x2(sse);