Remove no longer needed special case in Neon convolutions The neon_dotprod and neon_i8mm implementations of some convolution functions have a special case to handle the no-op filter, as 128 does not fit in a signed 8-bit integer. This no-op filter is never used in practice - the unit tests have been updated to reflect this in a previous patch - so remove the code to handle the no-op filter. Change-Id: Ifd43a4f752a6cb9de8bbfbe7b4577764a4208d7e
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index b558744..9c50890 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -105,6 +105,9 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); @@ -120,89 +123,60 @@ const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. - src += 5; + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + int16x4_t d0 = + convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = + convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); - int16x4_t d1 = - convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); - int16x4_t d2 = - convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); - int16x4_t d3 = - convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = - convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); - uint8x8_t d1 = - convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); - uint8x8_t d2 = - convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); - uint8x8_t d3 = - convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } @@ -520,44 +494,100 @@ static INLINE void convolve_y_sr_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &tA); + src_ptr += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; - int width = w; - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); - const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); - - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); - - if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; - load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, - &t8, &t9, &tA); - src_ptr += 11 * src_stride; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + s += 11 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); @@ -572,169 +602,87 @@ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); - transpose_concat_4x4(s4, s5, s6, s7, &s4567); - transpose_concat_4x4(s5, s6, s7, s8, &s5678); - transpose_concat_4x4(s6, s7, s8, s9, &s6789); - transpose_concat_4x4(s7, s8, s9, sA, &s789A); + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t tB, tC, tD, tE; - load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); - int8x16_t s89AB, s9ABC, sABCD, sBCDE; - transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. - int8x16x2_t samples_LUT = { { s789A, sBCDE } }; - s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); - int16x4_t d0 = - convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); - int16x4_t d1 = - convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); - int16x4_t d2 = - convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); - int16x4_t d3 = - convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s789A; - s4567 = s89AB; - s5678 = s9ABC; - s6789 = sABCD; - s789A = sBCDE; + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; - load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, - &t9, &tA); - s += 11 * src_stride; - - // Transform sample range to [-128, 127] for 8-bit signed dot product. - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); - int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, - s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); - - do { - uint8x8_t tB, tC, tD, tE; - load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); - - int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); - int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); - int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); - int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); - - int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, - sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); - - // Merge new data into block from previous iteration. - int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; - s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); - s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); - sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); - - int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; - s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); - s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); - sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, - s89AB_hi, filter_0_7, filter_4_11); - uint8x8_t d1 = - convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, - s9ABC_hi, filter_0_7, filter_4_11); - uint8x8_t d2 = - convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, - sABCD_hi, filter_0_7, filter_4_11); - uint8x8_t d3 = - convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, - sBCDE_hi, filter_0_7, filter_4_11); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s789A_lo; - s3456_hi = s789A_hi; - s4567_lo = s89AB_lo; - s4567_hi = s89AB_hi; - s5678_lo = s9ABC_lo; - s5678_hi = s9ABC_hi; - s6789_lo = sABCD_lo; - s6789_hi = sABCD_hi; - s789A_lo = sBCDE_lo; - s789A_hi = sBCDE_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -1026,15 +974,88 @@ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + const int bd = 8; - // Special case the following no-op filter as 128 won't fit into the 8-bit - // signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Dot product constants. + const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); do { const uint8_t *s = src_ptr; @@ -1042,123 +1063,20 @@ int width = w; do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. - vst1q_s16(d, vreinterpretq_s16_u16(d0)); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1q_s16(d, d0); - d += 8; s += 8; + d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); - - } else { - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - - // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - // Dot product constants. - const int32x4_t correction = - vdupq_n_s32((128 << FILTER_BITS) + horiz_const); - const uint8x16_t range_limit = vdupq_n_u8(128); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit, - permute_tbl); - int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit, - permute_tbl); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, - permute_tbl); - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - - } else { - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, - range_limit, permute_tbl); - int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, - range_limit, permute_tbl); - - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, - range_limit, permute_tbl); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } } }
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c index b2f489f..7ba8b66 100644 --- a/av1/common/arm/convolve_neon_i8mm.c +++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -95,94 +95,68 @@ int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { + // The no-op filter should never be used here. + assert(x_filter_ptr[5] != 128); + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(filter_0_7, 5) == 128) { - // Undo the horizontal offset in the calling function. - src += 5; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); - src += src_stride; - dst += dst_stride; - } while (--h != 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding - // right shift by FILTER_BITS - instead of a first rounding right shift by - // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - - // ROUND0_BITS. - const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); - int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); - int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); - int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); - - uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); - uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - - store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - - dst += 4 * dst_stride; - src += 4 * src_stride; - h -= 4; - } while (h != 0); - } else { - do { - const uint8_t *s = src; - uint8_t *d = dst; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); - uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); - uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); - uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); - - store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); } } @@ -470,187 +444,161 @@ uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + // The no-op filter should never be used here. + assert(y_filter_ptr[5] != 128); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &sA); + src_ptr += 11 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); + transpose_concat_4x4(s4, s5, s6, s7, &s4567); + transpose_concat_4x4(s5, s6, s7, s8, &s5678); + transpose_concat_4x4(s6, s7, s8, s9, &s6789); + transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB, s9ABC, sABCD, sBCDE; + transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; + s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = + convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); + int16x4_t d1 = + convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); + int16x4_t d2 = + convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); + int16x4_t d3 = + convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s789A; + s4567 = s89AB; + s5678 = s9ABC; + s6789 = sABCD; + s789A = sBCDE; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; - int width = w; - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); - const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); - - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); - - if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; - load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, - &s8, &s9, &sA); - src_ptr += 11 * src_stride; + load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; - // This operation combines a conventional transpose and the sample permute - // (see horizontal case) required before computing the dot product. - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); - transpose_concat_4x4(s4, s5, s6, s7, &s4567); - transpose_concat_4x4(s5, s6, s7, s8, &s5678); - transpose_concat_4x4(s6, s7, s8, s9, &s6789); - transpose_concat_4x4(s7, s8, s9, sA, &s789A); + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t sB, sC, sD, sE; - load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); + load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); - uint8x16_t s89AB, s9ABC, sABCD, sBCDE; - transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); + uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; - s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - int16x4_t d0 = - convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); - int16x4_t d1 = - convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); - int16x4_t d2 = - convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); - int16x4_t d3 = - convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + uint8x8_t d0 = + convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s789A; - s4567 = s89AB; - s5678 = s9ABC; - s6789 = sABCD; - s789A = sBCDE; + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; - load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, - &s9, &sA); - s += 11 * src_stride; - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, - s6789_lo, s6789_hi, s789A_lo, s789A_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); - transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); - - do { - uint8x8_t sB, sC, sD, sE; - load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); - - uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, - sBCDE_lo, sBCDE_hi; - transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); - - // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; - s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); - s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); - sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - - uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; - s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); - s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); - sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, - s89AB_hi, filter_0_7, filter_4_11); - uint8x8_t d1 = - convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, - s9ABC_hi, filter_0_7, filter_4_11); - uint8x8_t d2 = - convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, - sABCD_hi, filter_0_7, filter_4_11); - uint8x8_t d3 = - convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, - sBCDE_hi, filter_0_7, filter_4_11); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s789A_lo; - s3456_hi = s789A_hi; - s4567_lo = s89AB_lo; - s4567_hi = s89AB_hi; - s5678_lo = s9ABC_lo; - s5678_hi = s9ABC_hi; - s6789_lo = sABCD_lo; - s6789_hi = sABCD_hi; - s789A_lo = sBCDE_lo; - s789A_hi = sBCDE_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -684,155 +632,126 @@ uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (y_filter_ptr[5] == 128) { - // Undo the vertical offset in the calling function. - src_ptr += 5 * src_stride; + const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s4567, s5678, s6789, s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; - int width = w; - do { - uint8x8_t d0 = vld1_u8(s); - if (w == 4) { - store_u8_4x1(d, d0); - } else { - vst1_u8(d, d0); - } - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); - - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); - - if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6; - load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src_ptr += 7 * src_stride; + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; - // This operation combines a conventional transpose and the sample permute - // (see horizontal case) required before computing the dot product. - uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123); - transpose_concat_4x4(s1, s2, s3, s4, &s1234); - transpose_concat_4x4(s2, s3, s4, s5, &s2345); - transpose_concat_4x4(s3, s4, s5, s6, &s3456); + // This operation combines a conventional transpose and the sample + // permute (see horizontal case) required before computing the dot + // product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; - load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - uint8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT = { { s3456, s78910 } }; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; + s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); - int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); - int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); - int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; + s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); - store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + uint8x8_t d0 = + convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - do { - int height = h; - const uint8_t *s = src_ptr; - uint8_t *d = dst_ptr; - - uint8x8_t s0, s1, s2, s3, s4, s5, s6; - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - // This operation combines a conventional transpose and the sample - // permute (see horizontal case) required before computing the dot - // product. - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); - - do { - uint8x8_t s7, s8, s9, s10; - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, - s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - - // Merge new data into block from previous iteration. - uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; - s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); - - uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; - s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); - - uint8x8_t d0 = - convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); - uint8x8_t d1 = - convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); - uint8x8_t d2 = - convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); - uint8x8_t d3 = - convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - // Prepare block for next iteration - re-using as much as possible. - // Shuffle everything up four rows. - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src_ptr += 8; - dst_ptr += 8; - w -= 8; - } while (w != 0); - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); } } @@ -928,15 +847,80 @@ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { + // The no-op filter should never be used here. + assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); + const int bd = 8; - // Special case the following no-op filter as 128 won't fit into the - // 8-bit signed dot-product instruction: - // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } - if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { - const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); - // Undo the horizontal offset in the calling function. - src_ptr += 5; + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); do { const uint8_t *s = src_ptr; @@ -944,119 +928,20 @@ int width = w; do { - uint8x8_t s0 = vld1_u8(s); - uint16x8_t d0 = vaddw_u8(horiz_const, s0); - d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); - // Store 8 elements to avoid additional branches. This is safe if the - // actual block width is < 8 because the intermediate buffer is large - // enough to accommodate 128x128 blocks. - vst1q_s16(d, vreinterpretq_s16_u16(d0)); + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); - d += 8; s += 8; + d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); - - } else { - // Narrow filter values to 8-bit. - const int16x8x2_t x_filter_s16 = { - { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } - }; - const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), - vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. - const int32x4_t horiz_const = - vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); - const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); - - if (w <= 4) { - do { - uint8x16_t s0, s1, s2, s3; - load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); - - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x4_t d1 = - convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x4_t d2 = - convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x4_t d3 = - convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); - - store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - uint8x16_t s0 = vld1q_u8(src_ptr); - int16x4_t d0 = - convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1_s16(dst_ptr, d0); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - - } else { - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2], s1[2], s2[2], s3[2]; - load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); - load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); - - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - int16x8_t d1 = - convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); - int16x8_t d2 = - convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); - int16x8_t d3 = - convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); - - store_s16_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - - src_ptr += 4 * src_stride; - dst_ptr += 4 * dst_stride; - h -= 4; - } while (h > 4); - - do { - const uint8_t *s = src_ptr; - int16_t *d = dst_ptr; - int width = w; - - do { - uint8x16_t s0[2]; - s0[0] = vld1q_u8(s); - s0[1] = vld1q_u8(s + 4); - int16x8_t d0 = - convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); - vst1q_s16(d, d0); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } } }