Optimize narrowing sequences in Neon average helper functions
Use saturating Neon narrowing instructions only where the values being
narrowed may not fit in the destination type; also remove redundant
max(x, 0) sequences where vqmovun (saturating convert to unsigned and
narrow) suffices, since it already clamps negative inputs to zero.
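
For reference, a minimal sketch of the pattern being replaced, assuming
the int32 accumulators already fit in 16 bits after the vqrshlq_s32
rounding shift (the helper names and d0/d1 below are illustrative, not
identifiers from jnt_convolve_neon.c):

  #include <arm_neon.h>

  /* Before: saturating narrow, explicit clamp at zero, reinterpret,
   * then an unsigned saturating narrow. */
  static inline uint8x8_t narrow_to_u8_before(int32x4_t d0, int32x4_t d1) {
    int16x8_t s = vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1));
    s = vmaxq_s16(s, vdupq_n_s16(0));
    return vqmovn_u16(vreinterpretq_u16_s16(s));
  }

  /* After: plain (non-saturating) narrow, then a single vqmovun_s16,
   * which clamps negatives to zero and saturates above 255 in one
   * instruction. */
  static inline uint8x8_t narrow_to_u8_after(int32x4_t d0, int32x4_t d1) {
    int16x8_t s = vcombine_s16(vmovn_s32(d0), vmovn_s32(d1));
    return vqmovun_s16(s);
  }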
Change-Id: I95d9d08177a7f518adda6761630f4c9801637de6
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 22caf83..6f4f58e 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -45,7 +45,7 @@
dst0 = vqrshlq_s32(dst0, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
+ tmp0 = vmovn_s32(dst0);
tmp4 = vcombine_s16(tmp0, tmp0);
*t0 = vqmovun_s16(tmp4);
@@ -57,7 +57,7 @@
tmp0 = vqrshl_s16(tmp0, round_bits_vec);
- tmp4 = vcombine_s16(tmp0, tmp0);
+ tmp4 = vcombine_s16(tmp0, vdup_n_s16(0));
*t0 = vqmovun_s16(tmp4);
}
@@ -67,7 +67,6 @@
uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
const uint16_t bck_offset, const int16x4_t sub_const,
const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
- int16x4_t tmp0, tmp2;
int16x8_t f0;
uint32x4_t sum0, sum2;
int32x4_t dst0, dst2;
@@ -92,10 +91,7 @@
dst0 = vqrshlq_s32(dst0, round_bits_vec);
dst2 = vqrshlq_s32(dst2, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp2 = vqmovn_s32(dst2);
-
- f0 = vcombine_s16(tmp0, tmp2);
+ f0 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst2));
*t0 = vqmovun_s16(f0);
@@ -126,7 +122,6 @@
int32x4_t dst0, dst1, dst2, dst3;
int16x8_t tmp4, tmp5;
- const int16x8_t zero = vdupq_n_s16(0);
if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
@@ -156,17 +151,11 @@
dst2 = vqrshlq_s32(dst2, round_bits_vec);
dst3 = vqrshlq_s32(dst3, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp1 = vqmovn_s32(dst1);
- tmp2 = vqmovn_s32(dst2);
- tmp3 = vqmovn_s32(dst3);
- tmp4 = vcombine_s16(tmp0, tmp1);
- tmp5 = vcombine_s16(tmp2, tmp3);
- tmp4 = vmaxq_s16(tmp4, zero);
- tmp5 = vmaxq_s16(tmp5, zero);
+ tmp4 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst1));
+ tmp5 = vcombine_s16(vmovn_s32(dst2), vmovn_s32(dst3));
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ *t0 = vqmovun_s16(tmp4);
+ *t1 = vqmovun_s16(tmp5);
} else {
const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
tmp_u0 = vhadd_u16(res0, d0);
@@ -186,11 +175,9 @@
tmp4 = vcombine_s16(tmp0, tmp1);
tmp5 = vcombine_s16(tmp2, tmp3);
- tmp4 = vmaxq_s16(tmp4, zero);
- tmp5 = vmaxq_s16(tmp5, zero);
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ *t0 = vqmovun_s16(tmp4);
+ *t1 = vqmovun_s16(tmp5);
}
}
@@ -201,14 +188,12 @@
const int16x4_t sub_const, const int16_t round_bits,
const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1,
uint8x8_t *t2, uint8x8_t *t3) {
- int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
uint32x4_t sum0, sum1, sum2, sum3;
uint32x4_t sum4, sum5, sum6, sum7;
int32x4_t dst0, dst1, dst2, dst3;
int32x4_t dst4, dst5, dst6, dst7;
uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
- const int16x8_t zero = vdupq_n_s16(0);
if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
@@ -260,29 +245,15 @@
dst6 = vqrshlq_s32(dst6, round_bits_vec);
dst7 = vqrshlq_s32(dst7, round_bits_vec);
- tmp0 = vqmovn_s32(dst0);
- tmp1 = vqmovn_s32(dst1);
- tmp2 = vqmovn_s32(dst2);
- tmp3 = vqmovn_s32(dst3);
- tmp4 = vqmovn_s32(dst4);
- tmp5 = vqmovn_s32(dst5);
- tmp6 = vqmovn_s32(dst6);
- tmp7 = vqmovn_s32(dst7);
+ f0 = vcombine_s16(vmovn_s32(dst0), vmovn_s32(dst2));
+ f1 = vcombine_s16(vmovn_s32(dst1), vmovn_s32(dst3));
+ f2 = vcombine_s16(vmovn_s32(dst4), vmovn_s32(dst6));
+ f3 = vcombine_s16(vmovn_s32(dst5), vmovn_s32(dst7));
- f0 = vcombine_s16(tmp0, tmp2);
- f1 = vcombine_s16(tmp1, tmp3);
- f2 = vcombine_s16(tmp4, tmp6);
- f3 = vcombine_s16(tmp5, tmp7);
-
- f0 = vmaxq_s16(f0, zero);
- f1 = vmaxq_s16(f1, zero);
- f2 = vmaxq_s16(f2, zero);
- f3 = vmaxq_s16(f3, zero);
-
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
- *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
- *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ *t0 = vqmovun_s16(f0);
+ *t1 = vqmovun_s16(f1);
+ *t2 = vqmovun_s16(f2);
+ *t3 = vqmovun_s16(f3);
} else {
const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
@@ -303,15 +274,10 @@
f2 = vqrshlq_s16(f2, round_bits_vec);
f3 = vqrshlq_s16(f3, round_bits_vec);
- f0 = vmaxq_s16(f0, zero);
- f1 = vmaxq_s16(f1, zero);
- f2 = vmaxq_s16(f2, zero);
- f3 = vmaxq_s16(f3, zero);
-
- *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
- *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
- *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
- *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ *t0 = vqmovun_s16(f0);
+ *t1 = vqmovun_s16(f1);
+ *t2 = vqmovun_s16(f2);
+ *t3 = vqmovun_s16(f3);
}
}