Replace 64-bit shifts with vextq in cdef_find_dir_neon

cdef_find_dir_neon uses shifts over 64-bit elements to shuffle the data
for each direction, which is rather inefficient. Replace all shifts with
EXT instructions, which gives around 50% speedup for this function.

Change-Id: I885e10e6dd199c9f9a33cb9a43ec9d45431212b6
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 24d4cf9..a070927 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -74,92 +74,6 @@ } while (--height != 0); } -static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) { - return vreinterpretq_s16_s64(vcombine_s64(vcreate_s64(a), vcreate_s64(b))); -} - -#define SHL_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, vget_lane_u64(vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (n - 8) * 8), \ - 0)); \ - } - -#define SHL_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, vget_lane_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), 0)); \ - } - -#define SHL_LOW_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), 0), \ - vget_lane_u64( \ - vorr_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (8 - n) * 8)), \ - 0)); \ - } - -SHL_HIGH_NEON(14) -SHL_HIGH_NEON(12) -SHL_HIGH_NEON(10) -SHL_NEON(8) -SHL_LOW_NEON(6) -SHL_LOW_NEON(4) -SHL_LOW_NEON(2) - -#define v128_shl_n_byte_neon(a, n) v128_shl_##n##_byte_neon(a) - -#define SHR_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (n - 8) * 8), \ - 0), \ - 0); \ - } - -#define SHR_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), 0), 0); \ - } - -#define SHR_LOW_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - 
int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vorr_u64( \ - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (8 - n) * 8)), \ - 0), \ - vget_lane_u64( \ - vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - 0)); \ - } - -SHR_HIGH_NEON(14) -SHR_HIGH_NEON(12) -SHR_HIGH_NEON(10) -SHR_NEON(8) -SHR_LOW_NEON(6) -SHR_LOW_NEON(4) -SHR_LOW_NEON(2) - -#define v128_shr_n_byte_neon(a, n) v128_shr_##n##_byte_neon(a) - static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) { uint32x4_t t1 = vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b))); @@ -217,57 +131,87 @@ return partiala_u32; }
+// This function is called a first time to compute the cost along directions 4,
+// 5, 6, 7, and then a second time on a rotated block to compute directions
+// 0, 1, 2, 3. (0 means 45-degree up-right, 2 is horizontal, and so on.)
+//
+// For each direction the lines are shifted so that we can perform a
+// basic sum on each vector element. For example, direction 5 is "south by
+// southeast", so we need to add the pixels along each line i below:
+//
+//   0  1  2  3  4  5  6  7
+//   0  1  2  3  4  5  6  7
+//   8  0  1  2  3  4  5  6
+//   8  0  1  2  3  4  5  6
+//   9  8  0  1  2  3  4  5
+//   9  8  0  1  2  3  4  5
+//  10  9  8  0  1  2  3  4
+//  10  9  8  0  1  2  3  4
+//
+// For this to fit nicely in vectors, the lines need to be shifted like so:
+//            0  1  2  3  4  5  6  7
+//            0  1  2  3  4  5  6  7
+//         8  0  1  2  3  4  5  6
+//         8  0  1  2  3  4  5  6
+//      9  8  0  1  2  3  4  5
+//      9  8  0  1  2  3  4  5
+//  10  9  8  0  1  2  3  4
+//  10  9  8  0  1  2  3  4
+//
+// In this configuration we can now perform SIMD additions to get the cost
+// along direction 5. Since this won't fit into a single 128-bit vector, we use
+// two of them to compute each half of the new configuration, and pad the empty
+// spaces with zeros. Similar shifting is done for other directions, except
+// direction 6 which is straightforward as it's the vertical direction.
 static INLINE uint32x4_t compute_directions_neon(int16x8_t lines[8], uint32_t cost[4]) { - int16x8_t partial4a, partial4b, partial5a, partial5b, partial6, partial7a, - partial7b; - int16x8_t tmp; + const int16x8_t zero = vdupq_n_s16(0); // Partial sums for lines 0 and 1. - partial4a = v128_shl_n_byte_neon(lines[0], 14); - partial4b = v128_shr_n_byte_neon(lines[0], 2); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[1], 12)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[1], 4)); - tmp = vaddq_s16(lines[0], lines[1]); - partial5a = v128_shl_n_byte_neon(tmp, 10); - partial5b = v128_shr_n_byte_neon(tmp, 6); - partial7a = v128_shl_n_byte_neon(tmp, 4); - partial7b = v128_shr_n_byte_neon(tmp, 12); - partial6 = tmp; + int16x8_t partial4a = vextq_s16(zero, lines[0], 1); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); + int16x8_t partial4b = vextq_s16(lines[0], zero, 1); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); + int16x8_t tmp = vaddq_s16(lines[0], lines[1]); + int16x8_t partial5a = vextq_s16(zero, tmp, 3); + int16x8_t partial5b = vextq_s16(tmp, zero, 3); + int16x8_t partial7a = vextq_s16(zero, tmp, 6); + int16x8_t partial7b = vextq_s16(tmp, zero, 6); + int16x8_t partial6 = tmp; // Partial sums for lines 2 and 3. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[2], 10)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[2], 6)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[3], 8)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[3], 8)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); tmp = vaddq_s16(lines[2], lines[3]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 8)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 8)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 6)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 10)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 4 and 5. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[4], 6)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[4], 10)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[5], 4)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[5], 12)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); tmp = vaddq_s16(lines[4], lines[5]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 6)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 10)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 8)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 8)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 6 and 7. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[6], 2)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[6], 14)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); partial4a = vaddq_s16(partial4a, lines[7]); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); tmp = vaddq_s16(lines[6], lines[7]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 4)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 12)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 10)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 6)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); partial6 = vaddq_s16(partial6, tmp); uint32x4_t const0 = vreinterpretq_u32_u64(