Optimize 8-tap Neon I8MM path for av1_convolve_y_sr
The 8-tap convolution filters used with av1_convolve_y_sr all have even
coefficient values. Halving these filter coefficients reduces the
intermediate precision requirements inside the convolution kernels and
allows us to avoid some slower saturating narrow instructions.
Also update some variable names to be consistent with the rest of the
file.
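
For reference, a standalone scalar sketch (hypothetical, not part of this
patch) illustrating why the halved-coefficient path is safe and bit-exact.
The taps below are illustrative even values summing to 1 << FILTER_BITS;
the real tables live in av1/common/filter.h. With the full taps the
worst-case positive sum is 255 * (8 + 78 + 78 + 8) = 43860, which
overflows int16_t and forces a saturating narrow; with the halved taps it
is 21930, which fits, so a plain narrow suffices:

  #include <assert.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define FILTER_BITS 7

  // Rounding right shift then clamp to [0, 255], mirroring vqrshrun_n_s16.
  static uint8_t convolve8_scalar(const uint8_t *s, const int16_t *f,
                                  int shift) {
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k) sum += s[k] * f[k];
    const int32_t out = (sum + (1 << (shift - 1))) >> shift;
    return (uint8_t)(out < 0 ? 0 : out > 255 ? 255 : out);
  }

  int main(void) {
    // Illustrative even-valued 8-tap filter summing to 1 << FILTER_BITS.
    const int16_t taps[8] = { -2, 8, -20, 78, 78, -20, 8, -2 };
    int16_t half[8];
    for (int k = 0; k < 8; ++k) half[k] = taps[k] >> 1;  // exact: taps even

    srand(0);
    for (int i = 0; i < 100000; ++i) {
      uint8_t s[8];
      for (int k = 0; k < 8; ++k) s[k] = (uint8_t)(rand() & 0xff);
      // Halving the taps and shifting right by one less is bit-exact.
      assert(convolve8_scalar(s, taps, FILTER_BITS) ==
             convolve8_scalar(s, half, FILTER_BITS - 1));
    }
    return 0;
  }
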
Change-Id: I2526ed7b9d8703e59499f63dd608ecb6bd95a17e
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
index 9f58fae..a826316 100644
--- a/av1/common/arm/convolve_neon_i8mm.c
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -551,7 +551,7 @@
sum = vusdotq_lane_s32(sum, s1, filters, 1);
// Further narrowing and packing is performed by the caller.
- return vqmovn_s32(sum);
+ return vmovn_s32(sum);
}
static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
@@ -566,8 +566,9 @@
sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1);
// Narrow and re-pack.
- int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
- return vqrshrun_n_s16(sum, FILTER_BITS);
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
+ // We halved the filter values, so reduce the right shift by 1.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr,
@@ -575,7 +576,8 @@
uint8_t *dst_ptr,
int dst_stride, int w, int h,
const int16_t *y_filter_ptr) {
- const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1);
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
@@ -593,14 +595,14 @@
transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
do {
- uint8x8_t s7, s8, s9, s10;
- load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+ uint8x8_t s7, s8, s9, sA;
+ load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &sA);
- uint8x16_t s4567, s5678, s6789, s78910;
- transpose_concat_elems_u8_4x4(s7, s8, s9, s10, &s78910);
+ uint8x16_t s4567, s5678, s6789, s789A;
+ transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A);
// Merge new data into block from previous iteration.
- uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16x2_t samples_LUT = { { s3456, s789A } };
s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
@@ -608,9 +610,10 @@
int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
- int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d3 = convolve8_4_y(s3456, s789A, filter);
+ // We halved the filter values, so reduce the right shift by 1.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
@@ -620,7 +623,7 @@
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
- s3456 = s78910;
+ s3456 = s789A;
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
@@ -647,20 +650,20 @@
transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
- uint8x8_t s7, s8, s9, s10;
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+ uint8x8_t s7, s8, s9, sA;
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &sA);
uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
- s78910_lo, s78910_hi;
- transpose_concat_elems_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+ s789A_lo, s789A_hi;
+ transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
// Merge new data into block from previous iteration.
- uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
+ uint8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } };
s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
- uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
+ uint8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } };
s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
@@ -672,7 +675,7 @@
uint8x8_t d2 =
convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
uint8x8_t d3 =
- convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
+ convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -684,8 +687,8 @@
s1234_hi = s5678_hi;
s2345_lo = s6789_lo;
s2345_hi = s6789_hi;
- s3456_lo = s78910_lo;
- s3456_hi = s78910_hi;
+ s3456_lo = s789A_lo;
+ s3456_hi = s789A_hi;
s += 4 * src_stride;
d += 4 * dst_stride;