Refactor and tidy up av1_convolve_2d_sr_neon
This is the first of two patches refactoring av1_convolve_2d_sr_neon
so that subsequent optimizations are easier to implement.
The main changes are:
1) Remove tests and control logic for 2x* and *x2 block sizes, since
   the minimum block dimensions are 4x* and *x4 (see the first sketch
   below).
2) Remove prefetch hints, since they are often ignored by the core but
   still have to be fetched and decoded, and they also increase binary
   size (see the second sketch below).
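For illustration, the narrow-width store in the horizontal pass reduces
from a branch to an unconditional 4-lane store; a minimal sketch of the
pattern, taken from the diff below:

    // Before: branch on a width == 2 case that can never occur, since
    // the minimum block dimensions are 4x4.
    if (width == 4) {
      vst1_s16(dst_ptr, d0);
      dst_ptr += dst_stride;
    } else if (width == 2) {
      vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
      dst_ptr += dst_stride;
    }

    // After: always store all four 16-bit lanes.
    vst1_s16(dst_ptr, d0);
    dst_ptr += dst_stride;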
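The prefetch hints in question are the PLD/PRFM hints emitted by
__builtin_prefetch ahead of the corresponding vector loads, e.g. (from
the horizontal pass below):

    // Removed: hint the next four source rows before loading them. The
    // hints are often ignored by the core, but the instructions still
    // occupy fetch/decode bandwidth and code space.
    __builtin_prefetch(s + 0 * src_stride);
    __builtin_prefetch(s + 1 * src_stride);
    __builtin_prefetch(s + 2 * src_stride);
    __builtin_prefetch(s + 3 * src_stride);
    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);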
Change-Id: Ib86b848ff1016930f6280aa6d260031d8bb3a0d0
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 28009d8..f39e797 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -892,10 +892,12 @@
// Horizontal filtering for convolve_2d_sr for width <= 4
// Processes one row at a time
-static INLINE void horiz_filter_w4_single_row(
- const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
- const int dst_stride, int width, int height, const int16x8_t x_filter,
- const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+static INLINE void horiz_filter_w4_single_row(const uint8_t *src_ptr,
+ int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int height,
+ const int16x8_t x_filter,
+ const int16x4_t horiz_const,
+ const int16x4_t shift_round_0) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
const uint8_t *s = src_ptr;
@@ -923,14 +925,9 @@
int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- if (width == 4) {
- vst1_s16(dst_ptr, d0);
- dst_ptr += dst_stride;
- } else if (width == 2) {
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
- dst_ptr += dst_stride;
- }
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += dst_stride;
src_ptr += src_stride;
height--;
} while (height > 0);
@@ -948,12 +945,9 @@
subpel_y_qn, conv_params);
return;
}
- int im_dst_stride;
- int width, height;
+
#if defined(__aarch64__)
- uint8x8_t t0;
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
- const uint8_t *s;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
#endif
DECLARE_ALIGNED(16, int16_t,
@@ -966,13 +960,11 @@
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ int16_t *dst_ptr = im_block;
- int16_t *dst_ptr;
-
- dst_ptr = im_block;
- im_dst_stride = im_stride;
- height = im_h;
- width = w;
+ int im_dst_stride = im_stride;
+ int width = w;
+ int height = im_h;
const int16_t round_bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
@@ -987,19 +979,16 @@
assert(conv_params->round_0 > 0);
- if (w <= 4) {
+ if (w == 4) {
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
do {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ const uint8_t *s = src_ptr;
+
assert(height >= 4);
- s = src_ptr;
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1012,10 +1001,6 @@
s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
s += 7;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
@@ -1036,21 +1021,12 @@
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
- if (w == 4) {
- vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
- vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
- vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
- vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
- } else if (w == 2) {
- vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
- vreinterpret_u32_s16(d0), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
- vreinterpret_u32_s16(d1), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
- vreinterpret_u32_s16(d2), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
- vreinterpret_u32_s16(d3), 0);
- }
+
+ vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+
src_ptr += 4 * src_stride;
dst_ptr += 4 * im_dst_stride;
height -= 4;
@@ -1058,32 +1034,26 @@
if (height) {
assert(height < 4);
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride,
height, x_filter, horiz_const, shift_round_0);
}
-#else
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+
+#else // !defined(__aarch64__)
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride,
height, x_filter, horiz_const, shift_round_0);
-#endif
+#endif // defined(__aarch64__)
} else {
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
- int16_t *d_tmp;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- do {
- assert(height >= 8);
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ for (; height >= 8; height -= 8) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ d0, d1, d2, d3, d4, d5, d6, d7;
+ const uint8_t *s;
+ int16_t *d;
load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1099,16 +1069,7 @@
width = w;
s = src_ptr + 7;
- d_tmp = dst_ptr;
-
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
+ d = dst_ptr;
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1124,28 +1085,26 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- horiz_const, shift_round_0);
+ d0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
+ d1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ horiz_const, shift_round_0);
+ d2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ horiz_const, shift_round_0);
+ d3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ horiz_const, shift_round_0);
+ d4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ horiz_const, shift_round_0);
+ d5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ d6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ d7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ horiz_const, shift_round_0);
- transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
- &res7);
+ transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
- res6, res7);
+ store_s16_8x8(d, im_dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
s0 = s8;
s1 = s9;
@@ -1155,133 +1114,103 @@
s5 = s13;
s6 = s14;
s += 8;
- d_tmp += 8;
+ d += 8;
width -= 8;
} while (width > 0);
+
src_ptr += 8 * src_stride;
dst_ptr += 8 * im_dst_stride;
- height -= 8;
- } while (height >= 8);
+ }
- if (height >= 4) {
- assert(height < 8);
- int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- reg10, reg11, reg12, reg13, reg14;
- int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t out0, out1, out2, out3;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
+ for (; height >= 4; height -= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
+ int16x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ int16_t *d;
load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
s = src_ptr + 7;
- d_tmp = dst_ptr;
+ d = dst_ptr;
width = w;
do {
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
- x_filter);
+ dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+ dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+ dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+ dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+ dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
+ dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
+ dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
+ dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
- d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
- x_filter);
+ transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
+ &d1, &d2, &d3);
- d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- x_filter);
+ d0 = vaddq_s16(d0, horiz_const);
+ d1 = vaddq_s16(d1, horiz_const);
+ d2 = vaddq_s16(d2, horiz_const);
+ d3 = vaddq_s16(d3, horiz_const);
- d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
- x_filter);
+ d0 = vqrshlq_s16(d0, shift_round_0);
+ d1 = vqrshlq_s16(d1, shift_round_0);
+ d2 = vqrshlq_s16(d2, shift_round_0);
+ d3 = vqrshlq_s16(d3, shift_round_0);
- d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
- x_filter);
+ store_s16_8x4(d, im_dst_stride, d0, d1, d2, d3);
- d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
- x_filter);
-
- d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
- x_filter);
-
- d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
- x_filter);
-
- transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
- &out2, &out3);
-
- out0 = vaddq_s16(out0, horiz_const);
- out0 = vqrshlq_s16(out0, shift_round_0);
-
- out1 = vaddq_s16(out1, horiz_const);
- out1 = vqrshlq_s16(out1, shift_round_0);
-
- out2 = vaddq_s16(out2, horiz_const);
- out2 = vqrshlq_s16(out2, shift_round_0);
-
- out3 = vaddq_s16(out3, horiz_const);
- out3 = vqrshlq_s16(out3, shift_round_0);
-
- store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
-
- reg0 = reg8;
- reg1 = reg9;
- reg2 = reg10;
- reg3 = reg11;
- reg4 = reg12;
- reg5 = reg13;
- reg6 = reg14;
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
s += 8;
- d_tmp += 8;
+ d += 8;
width -= 8;
} while (width > 0);
+
src_ptr += 4 * src_stride;
dst_ptr += 4 * im_dst_stride;
- height -= 4;
}
if (height) {
assert(height < 4);
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
height, x_filter, horiz_const, shift_round_0);
}
-#else
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+#else // !defined(__aarch64__)
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
height, x_filter, horiz_const, shift_round_0);
-#endif
+#endif // defined(__aarch64__)
}
// vertical
{
- uint8_t *dst_u8_ptr, *d_u8;
- int16_t *v_src_ptr, *v_s;
-
const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
@@ -1293,13 +1222,13 @@
const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
src_stride = im_stride;
- v_src_ptr = im_block;
- dst_u8_ptr = dst;
+ int16_t *v_src_ptr = im_block;
+ uint8_t *v_dst_ptr = dst;
height = h;
width = w;
- if (width <= 4) {
+ if (width == 4) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
int16x8_t dd0;
uint8x8_t d01;
@@ -1308,32 +1237,18 @@
int16x4_t s8, s9, s10, d1, d2, d3;
int16x8_t dd1;
uint8x8_t d23;
-#endif
+#endif // defined(__aarch64__)
- d_u8 = dst_u8_ptr;
- v_s = v_src_ptr;
+ int16_t *s = v_src_ptr;
+ uint8_t *d = v_dst_ptr;
- __builtin_prefetch(v_s + 0 * im_stride);
- __builtin_prefetch(v_s + 1 * im_stride);
- __builtin_prefetch(v_s + 2 * im_stride);
- __builtin_prefetch(v_s + 3 * im_stride);
- __builtin_prefetch(v_s + 4 * im_stride);
- __builtin_prefetch(v_s + 5 * im_stride);
- __builtin_prefetch(v_s + 6 * im_stride);
- __builtin_prefetch(v_s + 7 * im_stride);
-
- load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
+ load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (7 * src_stride);
do {
#if defined(__aarch64__)
- load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
- v_s += (im_stride << 2);
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
- __builtin_prefetch(d_u8 + 1 * dst_stride);
- __builtin_prefetch(d_u8 + 2 * dst_stride);
- __builtin_prefetch(d_u8 + 3 * dst_stride);
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+ s += (4 * src_stride);
d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
round_shift_vec, offset_const,
@@ -1354,47 +1269,14 @@
d01 = vqmovun_s16(dd0);
d23 = vqmovun_s16(dd1);
- if ((w == 4) && (h != 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 0); // 20 21 22 23
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 1); // 30 31 32 33
- d_u8 += dst_stride;
- } else if ((w == 2) && (h != 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 0); // 20 21
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 2); // 30 31
- d_u8 += dst_stride;
- } else if ((w == 4) && (h == 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- } else if ((w == 2) && (h == 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
- }
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 1);
+ d += dst_stride;
s0 = s4;
s1 = s5;
@@ -1404,11 +1286,9 @@
s5 = s9;
s6 = s10;
height -= 4;
-#else
- s7 = vld1_s16(v_s);
- v_s += im_stride;
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
+#else // !defined(__aarch64__)
+ s7 = vld1_s16(s);
+ s += src_stride;
d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
round_shift_vec, offset_const,
@@ -1417,16 +1297,8 @@
dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
d01 = vqmovun_s16(dd0);
- if (w == 4) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
-
- } else if (w == 2) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- }
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
+ d += dst_stride;
s0 = s1;
s1 = s2;
@@ -1435,73 +1307,54 @@
s4 = s5;
s5 = s6;
s6 = s7;
- height -= 1;
-#endif
+ height--;
+#endif // defined(__aarch64__)
} while (height > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t res0;
+ uint8x8_t d0;
#if defined(__aarch64__)
int16x8_t s8, s9, s10;
- uint8x8_t res1, res2, res3;
-#endif
+ uint8x8_t d1, d2, d3;
+#endif // defined(__aarch64__)
do {
- __builtin_prefetch(v_src_ptr + 0 * im_stride);
- __builtin_prefetch(v_src_ptr + 1 * im_stride);
- __builtin_prefetch(v_src_ptr + 2 * im_stride);
- __builtin_prefetch(v_src_ptr + 3 * im_stride);
- __builtin_prefetch(v_src_ptr + 4 * im_stride);
- __builtin_prefetch(v_src_ptr + 5 * im_stride);
- __builtin_prefetch(v_src_ptr + 6 * im_stride);
- __builtin_prefetch(v_src_ptr + 7 * im_stride);
+ int16_t *s = v_src_ptr;
+ uint8_t *d = v_dst_ptr;
- v_s = v_src_ptr;
- load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (7 * src_stride);
- d_u8 = dst_u8_ptr;
height = h;
do {
#if defined(__aarch64__)
- load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
- v_s += (im_stride << 2);
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+ s += (4 * src_stride);
- __builtin_prefetch(d_u8 + 4 * dst_stride);
- __builtin_prefetch(d_u8 + 5 * dst_stride);
- __builtin_prefetch(d_u8 + 6 * dst_stride);
- __builtin_prefetch(d_u8 + 7 * dst_stride);
+ d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ d3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ vst1_u8(d, d0);
+ d += dst_stride;
+ vst1_u8(d, d1);
+ d += dst_stride;
+ vst1_u8(d, d2);
+ d += dst_stride;
+ vst1_u8(d, d3);
+ d += dst_stride;
- if (h != 2) {
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res2);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res3);
- d_u8 += dst_stride;
- } else {
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
- d_u8 += dst_stride;
- }
s0 = s4;
s1 = s5;
s2 = s6;
@@ -1510,18 +1363,16 @@
s5 = s9;
s6 = s10;
height -= 4;
-#else
- s7 = vld1q_s16(v_s);
- v_s += im_stride;
+#else // !defined(__aarch64__)
+ s7 = vld1q_s16(s);
+ s += src_stride;
- __builtin_prefetch(d_u8 + 0 * dst_stride);
+ d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
-
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
+ vst1_u8(d, d0);
+ d += dst_stride;
s0 = s1;
s1 = s2;
@@ -1530,11 +1381,12 @@
s4 = s5;
s5 = s6;
s6 = s7;
- height -= 1;
-#endif
+ height--;
+#endif // defined(__aarch64__)
} while (height > 0);
+
v_src_ptr += 8;
- dst_u8_ptr += 8;
+ v_dst_ptr += 8;
w -= 8;
} while (w > 0);
}