Refactor Neon compound convolution functions 5/7
Refactor the helper functions implementing the vertical pass of
av1_dist_wtd_convolve_2d_neon for both 6-tap and 8-tap filters. This
is mostly just a cosmetic change, as well as moving the convolution
kernel helper functions into the .c file from the convolve_neon.h
header.
Change-Id: I8b163ba4478d163608061278f491120a5f39e3e7
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index 520a393..cbca376 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -508,108 +508,6 @@
return sum;
}
-static INLINE uint16x4_t convolve6_4_s32(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x8_t y_filter,
- const int32x4_t offset_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- int32x4_t sum = offset_const;
- sum = vmlal_lane_s16(sum, s0, y_filter_lo, 1);
- sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
- sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
- sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
- sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
- sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t convolve6_8_s32(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t y_filter,
- const int32x4_t offset_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- int32x4_t sum0 = offset_const;
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_lo, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
-
- int32x4_t sum1 = offset_const;
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_lo, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
-static INLINE uint16x4_t convolve8_4_s32(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7,
- const int16x8_t y_filter,
- const int32x4_t offset_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- int32x4_t sum = offset_const;
- sum = vmlal_lane_s16(sum, s0, y_filter_lo, 0);
- sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
- sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
- sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
- sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
- sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
- sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
- sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
-
- return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
-}
-
-static INLINE uint16x8_t convolve8_8_s32(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x8_t s4, const int16x8_t s5,
- const int16x8_t s6, const int16x8_t s7,
- const int16x8_t y_filter,
- const int32x4_t offset_const) {
- const int16x4_t y_filter_lo = vget_low_s16(y_filter);
- const int16x4_t y_filter_hi = vget_high_s16(y_filter);
-
- int32x4_t sum0 = offset_const;
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_lo, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
- sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
-
- int32x4_t sum1 = offset_const;
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_lo, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
- sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
-
- return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
- vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
-}
-
#if !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE int16x4_t convolve8_horiz_4x4_s16(
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index af4cf8f..02f6370 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -592,28 +592,75 @@
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint16x4_t
+convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ // Filter values at indices 0 and 7 are 0.
+ sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ // Filter values at indices 0 and 7 are 0.
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
(1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
+
const int do_average = conv_params->do_average;
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
if (w == 4) {
int16x4_t s0, s1, s2, s3, s4, s5;
uint16x4_t dd0, d0;
uint8x8_t d01_u8;
-
#if defined(__aarch64__)
int16x4_t s6, s7, s8;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
@@ -627,10 +674,10 @@
#if defined(__aarch64__)
load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
- d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_4_s32(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_4_s32(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_4_s32(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+ d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
if (do_average) {
load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -659,7 +706,7 @@
#else // !defined(__aarch64__)
s5 = vld1_s16(src_ptr);
- d0 = convolve6_4_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
if (do_average) {
dd0 = vld1_u16(dst_ptr);
@@ -683,13 +730,11 @@
dst_ptr += dst_stride;
h--;
#endif // defined(__aarch64__)
- } while (h > 0);
-
+ } while (h != 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5;
uint16x8_t dd0, d0;
uint8x8_t d0_u8;
-
#if defined(__aarch64__)
int16x8_t s6, s7, s8;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
@@ -698,7 +743,7 @@
do {
int16_t *s = src_ptr;
- uint16_t *d = dst_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
uint8_t *d_u8 = dst8_ptr;
int height = h;
@@ -709,10 +754,10 @@
#if defined(__aarch64__)
load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
- d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
- d1 = convolve6_8_s32(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
- d2 = convolve6_8_s32(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
- d3 = convolve6_8_s32(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+ d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
if (do_average) {
load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -738,7 +783,7 @@
#else // !defined(__aarch64__)
s5 = vld1q_s16(s);
- d0 = convolve6_8_s32(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
if (do_average) {
dd0 = vld1q_u16(d);
@@ -757,43 +802,94 @@
s2 = s3;
s3 = s4;
s4 = s5;
-
s += src_stride;
d += dst_stride;
height--;
#endif // defined(__aarch64__)
- } while (height > 0);
-
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
dst8_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}
+static INLINE uint16x4_t
+convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = offset_const;
+ sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t
+convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = offset_const;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
- CONV_BUF_TYPE *dst_ptr = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
-
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
(1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
- const uint16_t fwd_offset = conv_params->fwd_offset;
- const uint16_t bck_offset = conv_params->bck_offset;
+
const int do_average = conv_params->do_average;
const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
if (w == 4) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x4_t dd0, d0;
uint8x8_t d01_u8;
-
#if defined(__aarch64__)
int16x4_t s8, s9, s10;
uint16x4_t dd1, dd2, dd3, d1, d2, d3;
@@ -807,14 +903,14 @@
#if defined(__aarch64__)
load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
- d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
+ d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ offset_const);
if (do_average) {
load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -845,8 +941,8 @@
#else // !defined(__aarch64__)
s7 = vld1_s16(src_ptr);
- d0 = convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
+ d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
if (do_average) {
dd0 = vld1_u16(dst_ptr);
@@ -872,13 +968,11 @@
dst_ptr += dst_stride;
h--;
#endif // defined(__aarch64__)
- } while (h > 0);
-
+ } while (h != 0);
} else {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x8_t dd0, d0;
uint8x8_t d0_u8;
-
#if defined(__aarch64__)
int16x8_t s8, s9, s10;
uint16x8_t dd1, dd2, dd3, d1, d2, d3;
@@ -887,7 +981,7 @@
do {
int16_t *s = src_ptr;
- uint16_t *d = dst_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
uint8_t *d_u8 = dst8_ptr;
int height = h;
@@ -898,14 +992,14 @@
#if defined(__aarch64__)
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
- d1 = convolve8_8_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- offset_const);
- d2 = convolve8_8_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- offset_const);
- d3 = convolve8_8_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- offset_const);
+ d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ offset_const);
if (do_average) {
load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
@@ -933,8 +1027,8 @@
#else // !defined(__aarch64__)
s7 = vld1q_s16(s);
- d0 = convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- offset_const);
+ d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
if (do_average) {
dd0 = vld1q_u16(d);
@@ -959,13 +1053,12 @@
d += dst_stride;
height--;
#endif // defined(__aarch64__)
- } while (height > 0);
-
+ } while (height != 0);
src_ptr += 8;
dst_ptr += 8;
dst8_ptr += 8;
w -= 8;
- } while (w > 0);
+ } while (w != 0);
}
}