Simplify cost computation in cdef_find_dir_neon

Use multiply-accumulate operations instead of zips and pairwise
additions to perform the final sum of squares. This gives around 8%
speedup for this function on both Clang and GCC.

Change-Id: Ib4528d1e5621f9261a997e80161c1c5aba8e8247
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c index a070927..45d58aa 100644 --- a/av1/common/arm/cdef_block_neon.c +++ b/av1/common/arm/cdef_block_neon.c
@@ -74,19 +74,6 @@ } while (--height != 0); } -static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) { - uint32x4_t t1 = - vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b))); - uint32x4_t t2 = - vreinterpretq_u32_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b))); -#if AOM_ARCH_AARCH64 - return vpaddq_u32(t1, t2); -#else - return vcombine_u32(vpadd_u32(vget_low_u32(t1), vget_high_u32(t1)), - vpadd_u32(vget_low_u32(t2), vget_high_u32(t2))); -#endif -} - // partial A is a 16-bit vector of the form: // [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: // [0 y1 y2 y3 y4 y5 y6 y7]. @@ -97,8 +84,8 @@ int16x8_t partialb, uint32x4_t const1, uint32x4_t const2) { - int16x8_t tmp; // Reverse partial B. + // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. uint8x16_t pattern = vreinterpretq_u8_u64( vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); @@ -114,21 +101,18 @@ partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif - // Interleave the x and y values of identical indices and pair x8 with 0. - tmp = partiala; - partiala = vzipq_s16(partiala, partialb).val[0]; - partialb = vzipq_s16(tmp, partialb).val[1]; // Square and add the corresponding x and y values. - uint32x4_t partiala_u32 = v128_madd_s16_neon(partiala, partiala); - uint32x4_t partialb_u32 = v128_madd_s16_neon(partialb, partialb); + int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); + cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); + int32x4_t cost_hi = + vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); + cost_hi = + vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); // Multiply by constant. - partiala_u32 = vmulq_u32(partiala_u32, const1); - partialb_u32 = vmulq_u32(partialb_u32, const2); - - // Sum all results. 
- partiala_u32 = vaddq_u32(partiala_u32, partialb_u32); - return partiala_u32; + uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); + return cost; } // This function is called a first time to compute the cost along directions 4, @@ -227,11 +211,15 @@ vcreate_u64((uint64_t)105 << 32 | 105))); // Compute costs in terms of partial sums. + int32x4_t partial6_s32 = + vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); + partial6_s32 = + vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); + uint32x4_t costs[4]; costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); - costs[2] = v128_madd_s16_neon(partial6, partial6); - costs[2] = vmulq_n_u32(costs[2], 105); + costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); costs[0] = horizontal_add_4d_u32x4(costs);