Count down in Neon sum square loops

Counting down and terminating on a zero comparison allows us
to use flag-setting arithmetic instructions, avoiding
an additional CMP instruction before the conditional branch.
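This doesn't really affect performance, but code size is reduced
by a useful amount since loop prologues are also shorter. Note that
the `h != 0` exit test relies on height being a positive multiple
of 4, matching the four rows consumed per loop iteration.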

Change-Id: Id623b28f804198ea6bf36361933bef8f17baec84
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index bf212a9..095a2c6 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -35,7 +35,7 @@
                                                        int stride, int height) {
   int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
 
-  int h = 0;
+  int h = height;
   do {
     int16x4_t s0 = vld1_s16(src + 0 * stride);
     int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -48,8 +48,8 @@
     sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
 
     src += 4 * stride;
-    h += 4;
-  } while (h < height);
+    h -= 4;
+  } while (h != 0);
 
   return horizontal_long_add_u32x4(
       vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
@@ -60,7 +60,7 @@
                                                        int height) {
   uint64x2_t sum_squares = vdupq_n_u64(0);
 
-  int h = 0;
+  int h = height;
   do {
     int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
     int w = 0;
@@ -86,8 +86,8 @@
         sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
 
     src += 4 * stride;
-    h += 4;
-  } while (h < height);
+    h -= 4;
+  } while (h != 0);
 
   return horizontal_add_u64x2(sum_squares);
 }
@@ -134,7 +134,7 @@
   int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
   int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
 
-  int h = 0;
+  int h = height;
   do {
     int16x4_t s0 = vld1_s16(src + 0 * stride);
     int16x4_t s1 = vld1_s16(src + 1 * stride);
@@ -152,8 +152,8 @@
     sum_acc[1] = vpadal_s16(sum_acc[1], s3);
 
     src += 4 * stride;
-    h += 4;
-  } while (h < height);
+    h -= 4;
+  } while (h != 0);
 
   *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
   return horizontal_long_add_u32x4(
@@ -166,7 +166,7 @@
   uint64x2_t sse = vdupq_n_u64(0);
   int32x4_t sum_acc = vdupq_n_s32(0);
 
-  int h = 0;
+  int h = height;
   do {
     int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
     int w = 0;
@@ -198,8 +198,8 @@
                       vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
 
     src += 4 * stride;
-    h += 4;
-  } while (h < height);
+    h -= 4;
+  } while (h != 0);
 
   *sum += horizontal_add_s32x4(sum_acc);
   return horizontal_add_u64x2(sse);