Count down in Neon subpel variance loops
Counting down and terminating on a zero comparison allows us
to use flag-setting arithmetic instructions, avoiding
an additional CMP instruction before the conditional branch.
This does not significantly affect performance, but it reduces code size
by a useful amount, since the loop prologues also become shorter.
Change-Id: I014288f6cbb8b5033c13fd62d6fb16dbc8a78c6e
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 799b994..68d0936 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -26,7 +26,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
@@ -37,8 +37,8 @@
src_ptr += 2 * src_stride;
dst_ptr += 2 * 4;
- i += 2;
- } while (i < dst_height);
+ i -= 2;
+ } while (i != 0);
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
@@ -47,7 +47,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
uint8x8_t s0 = vld1_u8(src_ptr);
uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
@@ -58,8 +58,7 @@
src_ptr += src_stride;
dst_ptr += 8;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
@@ -69,7 +68,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
int j = 0;
do {
@@ -88,8 +87,7 @@
src_ptr += src_stride;
dst_ptr += dst_width;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
@@ -127,7 +125,7 @@
// We only specialise on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
- int i = 0;
+ int i = dst_height;
do {
int j = 0;
do {
@@ -141,8 +139,7 @@
src_ptr += src_stride;
dst_ptr += dst_width;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
@@ -257,7 +254,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
@@ -273,8 +270,8 @@
src_ptr += 2 * src_stride;
dst_ptr += 2 * 4;
second_pred += 2 * 4;
- i += 2;
- } while (i < dst_height);
+ i -= 2;
+ } while (i != 0);
}
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
@@ -286,7 +283,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
uint8x8_t s0 = vld1_u8(src_ptr);
uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
@@ -302,8 +299,7 @@
src_ptr += src_stride;
dst_ptr += 8;
second_pred += 8;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
// Combine bilinear filter with aom_comp_avg_pred for large blocks.
@@ -314,7 +310,7 @@
const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
const uint8x8_t f1 = vdup_n_u8(filter_offset);
- int i = 0;
+ int i = dst_height;
do {
int j = 0;
do {
@@ -338,8 +334,7 @@
src_ptr += src_stride;
dst_ptr += dst_width;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
@@ -387,7 +382,7 @@
// We only specialise on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
- int i = 0;
+ int i = dst_height;
do {
int j = 0;
do {
@@ -406,8 +401,7 @@
src_ptr += src_stride;
dst_ptr += dst_width;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
// Implementation of aom_comp_avg_pred for blocks having width >= 16.
@@ -417,7 +411,7 @@
// We only specialise on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
- int i = 0;
+ int i = dst_height;
do {
int j = 0;
do {
@@ -434,8 +428,7 @@
src_ptr += src_stride;
dst_ptr += dst_width;
- i++;
- } while (i < dst_height);
+ } while (--i != 0);
}
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \