Refactor implementation of av1_dist_wtd_convolve_y_neon

Refactor and tidy up the 8-tap implementation of
av1_dist_wtd_convolve_y_neon. This is a first step before
specializing the implementation for 6-tap filters in a
subsequent patch.

Change-Id: I4343167b8b985a032ef23f7005b4669d2fa7b796
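
Illustrative sketch only (not part of this change): the commit message refers to a
follow-up 6-tap specialization. Assuming the follow-up mirrors the helpers added
below, and given that AV1's 6-tap kernels keep their non-zero taps in elements 1..6
of the 8-element filter array, a 6-tap counterpart of the new 4x4 helper could look
roughly like the function below. The name, signature and tap layout here are
assumptions, not the actual subsequent patch.

  #include <arm_neon.h>

  // Hypothetical 6-tap variant of convolve8_y_4x4_s16 (sketch only).
  // Assumes the non-zero taps live in elements 1..6 of 'filter'.
  static inline int16x4_t convolve6_y_4x4_s16(
      const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
      const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
      const int16x8_t filter, const int16x4_t shift_round_0) {
    const int16x4_t filter_lo = vget_low_s16(filter);
    const int16x4_t filter_hi = vget_high_s16(filter);
    int16x4_t sum;

    // Skip lane 0 and lane 7, which are zero for 6-tap kernels.
    sum = vmul_lane_s16(s0, filter_lo, 1);
    sum = vmla_lane_s16(sum, s1, filter_lo, 2);
    sum = vmla_lane_s16(sum, s2, filter_lo, 3);
    sum = vmla_lane_s16(sum, s3, filter_hi, 0);
    sum = vmla_lane_s16(sum, s4, filter_hi, 1);
    sum = vmla_lane_s16(sum, s5, filter_hi, 2);

    // Same rounding shift as the 8-tap helpers added in this patch.
    return vqrshl_s16(sum, shift_round_0);
  }
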
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 4293443..6c1e38e 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -2011,6 +2011,48 @@
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+static INLINE int16x4_t
+convolve8_y_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter, const int16x4_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x4_t sum;
+
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmla_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+
+ return vqrshl_s16(sum, shift_round_0);
+}
+
+static INLINE int16x8_t
+convolve8_y_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter, const int16x8_t shift_round_0) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+ return vqrshlq_s16(sum, shift_round_0);
+}
+
void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
const InterpFilterParams *filter_params_y,
@@ -2019,9 +2061,18 @@
assert(!(w % 4));
assert(!(h % 4));
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
+ // vertical filter
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // Filter values are even, so downshift by 1 to reduce intermediate
+ // precision requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
+
const int vert_offset = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
const int bits = FILTER_BITS - conv_params->round_0;
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -2034,75 +2085,54 @@
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int shift_value = (conv_params->round_1 - 1 - bits);
- // vertical filter
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
- const uint8_t *src_ptr = src - (vert_offset * src_stride);
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
-
- const uint8_t *s;
- uint8_t *d_u8;
- uint8_t *dst_u8_ptr;
- CONV_BUF_TYPE *d, *dst_ptr;
- int width, height;
-
- s = src_ptr;
- dst_ptr = dst;
- dst_u8_ptr = dst8;
- width = w;
- height = h;
-
// used to get rid of multiplication = (vertical filter output sum) *
// (1<<bits).
assert((conv_params->round_1 - 2) >= bits);
if ((w == 4) || (h == 4)) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
- uint16x4_t res4;
- uint8x8_t tu0 = vdup_n_u8(0);
- uint8x8_t tu1 = vdup_n_u8(0);
- uint8x8_t tu2 = vdup_n_u8(0);
- uint8x8_t tu3 = vdup_n_u8(0);
- int16x8_t u0, u1, u2, u3;
- uint8x8_t t0;
+ uint16x4_t dd0;
+ uint8x8_t t0 = vdup_n_u8(0);
+ uint8x8_t t1 = vdup_n_u8(0);
+ uint8x8_t t2 = vdup_n_u8(0);
+ uint8x8_t t3 = vdup_n_u8(0);
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint8x8_t d01;
#if defined(__aarch64__)
int16x4_t s8, s9, s10, d1, d2, d3;
- uint16x4_t res5, res6, res7;
- uint8x8_t t1;
+ uint16x4_t dd1, dd2, dd3;
+ uint8x8_t d23;
#endif
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x4_t shift_vec = vdup_n_s16(-shift_value);
- const int16x4_t zero = vdup_n_s16(0);
+ int width = w;
do {
- s = src_ptr;
- d = dst_ptr;
- d_u8 = dst_u8_ptr;
- height = h;
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8;
+ int height = h;
+
__builtin_prefetch(s + 0 * src_stride);
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+ load_unaligned_u8_4x8(s, src_stride, &t0, &t1, &t2, &t3);
- u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
- u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
- u2 = vreinterpretq_s16_u16(vmovl_u8(tu2));
- u3 = vreinterpretq_s16_u16(vmovl_u8(tu3));
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s0 = vget_low_s16(u0);
- s1 = vget_high_s16(u0);
- s2 = vget_low_s16(u1);
- s3 = vget_high_s16(u1);
- s4 = vget_low_s16(u2);
- s5 = vget_high_s16(u2);
- s6 = vget_low_s16(u3);
+ s0 = vget_low_s16(tt0);
+ s1 = vget_high_s16(tt0);
+ s2 = vget_low_s16(tt1);
+ s3 = vget_high_s16(tt1);
+ s4 = vget_low_s16(tt2);
+ s5 = vget_high_s16(tt2);
+ s6 = vget_low_s16(tt3);
__builtin_prefetch(d + 0 * dst_stride);
__builtin_prefetch(d + 1 * dst_stride);
@@ -2112,27 +2142,28 @@
s += 7 * src_stride;
do {
#if defined(__aarch64__)
- load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+ load_unaligned_u8_4x4(s, src_stride, &t0, &t1);
- u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
- u1 = vreinterpretq_s16_u16(vmovl_u8(tu1));
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s7 = vget_low_s16(u0);
- s8 = vget_high_s16(u0);
- s9 = vget_low_s16(u1);
- s10 = vget_high_s16(u1);
+ s7 = vget_low_s16(tt0);
+ s8 = vget_high_s16(tt0);
+ s9 = vget_low_s16(tt1);
+ s10 = vget_high_s16(tt1);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
- shift_vec);
+ d0 = convolve8_y_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ shift_vec);
+ d1 = convolve8_y_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ shift_vec);
+ d2 = convolve8_y_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ shift_vec);
+ d3 = convolve8_y_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ shift_vec);
+
d0 = vadd_s16(d0, round_offset64);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
- shift_vec);
d1 = vadd_s16(d1, round_offset64);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
- shift_vec);
d2 = vadd_s16(d2, round_offset64);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, zero,
- shift_vec);
d3 = vadd_s16(d3, round_offset64);
if (conv_params->do_average) {
@@ -2146,18 +2177,18 @@
__builtin_prefetch(d_u8 + 2 * dst8_stride);
__builtin_prefetch(d_u8 + 3 * dst8_stride);
- load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
- compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+ compute_avg_4x4(dd0, dd1, dd2, dd3, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
round_offset64, round_bits, use_dist_wtd_comp_avg,
- &t0, &t1);
+ &d01, &d23);
- store_u8_4x1(d_u8 + 0 * dst8_stride, t0, 0);
- store_u8_4x1(d_u8 + 1 * dst8_stride, t0, 1);
- store_u8_4x1(d_u8 + 2 * dst8_stride, t1, 0);
- store_u8_4x1(d_u8 + 3 * dst8_stride, t1, 1);
+ store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0);
+ store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1);
+ store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0);
+ store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1);
} else {
store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
@@ -2176,25 +2207,25 @@
d_u8 += 4 * dst8_stride;
height -= 4;
#else
- tu0 = load_unaligned_u8_4x1(s);
- u0 = vreinterpretq_s16_u16(vmovl_u8(tu0));
- s7 = vget_low_s16(u0);
+ t0 = load_unaligned_u8_4x1(s);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s7 = vget_low_s16(tt0);
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
- shift_vec);
+ d0 = convolve8_y_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ shift_vec);
d0 = vadd_s16(d0, round_offset64);
if (conv_params->do_average) {
__builtin_prefetch(d);
- res4 = vld1_u16(d);
+ dd0 = vld1_u16(d);
- compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
- bck_offset, round_offset64, round_bits,
- use_dist_wtd_comp_avg, &t0);
+ compute_avg_4x1(dd0, vreinterpret_u16_s16(d0), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &d01);
- store_u8_4x1(d_u8, t0, 0);
+ store_u8_4x1(d_u8, d01, 0);
} else {
vst1_u16(d, vreinterpret_u16_s16(d0));
}
@@ -2214,37 +2245,39 @@
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
- dst_u8_ptr += 4;
+ dst8 += 4;
width -= 4;
} while (width > 0);
} else {
- CONV_BUF_TYPE *d_tmp;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- int16x8_t res0;
- uint16x8_t res8;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
- const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ uint16x8_t dd0;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+
#if defined(__aarch64__)
- int16x8_t s8, s9, s10, s11, s12, s13, s14;
- int16x8_t res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t res9, res10, res11;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, d1, d2, d3, d4, d5, d6, d7;
+ uint16x8_t dd1, dd2, dd3;
uint8x8_t t7;
#endif
- dst_ptr = dst;
- dst_u8_ptr = dst8;
+ int width = w;
+
do {
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
- load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8;
+ int height = h;
+
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ __builtin_prefetch(s + 4 * src_stride);
+ __builtin_prefetch(s + 5 * src_stride);
+ __builtin_prefetch(s + 6 * src_stride);
+ __builtin_prefetch(s + 7 * src_stride);
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
@@ -2254,10 +2287,7 @@
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- height = h;
- s = src_ptr + (7 * src_stride);
- d_tmp = dst_ptr;
- d_u8 = dst_u8_ptr;
+ s += 7 * src_stride;
do {
#if defined(__aarch64__)
@@ -2277,70 +2307,68 @@
__builtin_prefetch(dst_ptr + 2 * dst_stride);
__builtin_prefetch(dst_ptr + 3 * dst_stride);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ d0 = convolve8_y_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
shift_vec);
- res0 = vaddq_s16(res0, round_offset128);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, zero,
+ d1 = convolve8_y_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
shift_vec);
- res1 = vaddq_s16(res1, round_offset128);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, zero,
+ d2 = convolve8_y_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
shift_vec);
- res2 = vaddq_s16(res2, round_offset128);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- zero, shift_vec);
- res3 = vaddq_s16(res3, round_offset128);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
- zero, shift_vec);
- res4 = vaddq_s16(res4, round_offset128);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
- zero, shift_vec);
- res5 = vaddq_s16(res5, round_offset128);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
- zero, shift_vec);
- res6 = vaddq_s16(res6, round_offset128);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
- zero, shift_vec);
- res7 = vaddq_s16(res7, round_offset128);
+ d3 = convolve8_y_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ shift_vec);
+ d4 = convolve8_y_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter,
+ shift_vec);
+ d5 = convolve8_y_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, y_filter,
+ shift_vec);
+ d6 = convolve8_y_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, y_filter,
+ shift_vec);
+ d7 = convolve8_y_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, y_filter,
+ shift_vec);
+
+ d0 = vaddq_s16(d0, round_offset128);
+ d1 = vaddq_s16(d1, round_offset128);
+ d2 = vaddq_s16(d2, round_offset128);
+ d3 = vaddq_s16(d3, round_offset128);
+ d4 = vaddq_s16(d4, round_offset128);
+ d5 = vaddq_s16(d5, round_offset128);
+ d6 = vaddq_s16(d6, round_offset128);
+ d7 = vaddq_s16(d7, round_offset128);
if (conv_params->do_average) {
- __builtin_prefetch(d_tmp + 0 * dst8_stride);
- __builtin_prefetch(d_tmp + 1 * dst8_stride);
- __builtin_prefetch(d_tmp + 2 * dst8_stride);
- __builtin_prefetch(d_tmp + 3 * dst8_stride);
+ __builtin_prefetch(d + 0 * dst8_stride);
+ __builtin_prefetch(d + 1 * dst8_stride);
+ __builtin_prefetch(d + 2 * dst8_stride);
+ __builtin_prefetch(d + 3 * dst8_stride);
- load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
- d_tmp += 4 * dst_stride;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+ d += 4 * dst_stride;
- compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1),
- vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ compute_avg_8x4(dd0, dd1, dd2, dd3, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3), fwd_offset, bck_offset,
round_offset64, round_bits, use_dist_wtd_comp_avg,
&t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += 4 * dst8_stride;
- load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
- d_tmp += 4 * dst_stride;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+ d += 4 * dst_stride;
- compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5),
- vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ compute_avg_8x4(dd0, dd1, dd2, dd3, vreinterpretq_u16_s16(d4),
+ vreinterpretq_u16_s16(d5), vreinterpretq_u16_s16(d6),
+ vreinterpretq_u16_s16(d7), fwd_offset, bck_offset,
round_offset64, round_bits, use_dist_wtd_comp_avg,
&t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += 4 * dst8_stride;
} else {
- store_u16_8x8(
- d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7));
- d_tmp += 8 * dst_stride;
+ store_u16_8x8(d, dst_stride, vreinterpretq_u16_s16(d0),
+ vreinterpretq_u16_s16(d1), vreinterpretq_u16_s16(d2),
+ vreinterpretq_u16_s16(d3), vreinterpretq_u16_s16(d4),
+ vreinterpretq_u16_s16(d5), vreinterpretq_u16_s16(d6),
+ vreinterpretq_u16_s16(d7));
+ d += 8 * dst_stride;
}
s0 = s8;
@@ -2357,9 +2385,9 @@
__builtin_prefetch(dst_ptr);
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, zero,
+ d0 = convolve8_y_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
shift_vec);
- res0 = vaddq_s16(res0, round_offset128);
+ d0 = vaddq_s16(d0, round_offset128);
s0 = s1;
s1 = s2;
@@ -2370,20 +2398,20 @@
s6 = s7;
if (conv_params->do_average) {
- __builtin_prefetch(d_tmp);
+ __builtin_prefetch(d);
- res8 = vld1q_u16(d_tmp);
- d_tmp += dst_stride;
+ dd0 = vld1q_u16(d);
+ d += dst_stride;
- compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ compute_avg_8x1(dd0, vreinterpretq_u16_s16(d0), fwd_offset,
bck_offset, round_offset64, round_bits,
use_dist_wtd_comp_avg, &t0);
vst1_u8(d_u8, t0);
d_u8 += dst8_stride;
} else {
- vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
- d_tmp += dst_stride;
+ vst1q_u16(d, vreinterpretq_u16_s16(d0));
+ d += dst_stride;
}
s += src_stride;
@@ -2392,7 +2420,7 @@
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;
- dst_u8_ptr += 8;
+ dst8 += 8;
width -= 8;
} while (width > 0);
}
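
Note on the coefficient halving (illustrative, not part of the diff): since the
filter taps are even, halving the kernel makes the vertical sum exactly half of
the full-precision sum. The value this path stores is effectively

  ROUND_POWER_OF_TWO(sum << bits, round_1) + round_offset

(the "(vertical filter output sum) * (1 << bits)" referred to in the comment
above), and with the halved sum this equals

  ROUND_POWER_OF_TWO(sum / 2, round_1 - 1 - bits) + round_offset,

which is what the vqrshl_s16/vqrshlq_s16 call at the end of the new helpers
computes, given shift_round_0 = vdup_n_s16(-shift_value) with
shift_value = round_1 - 1 - bits, before round_offset is added. The existing
assert((conv_params->round_1 - 2) >= bits) guarantees the effective right shift
is at least 1, so the rounding shift stays well defined.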