Revert "Refactor and tidy up av1_convolve_2d_sr_neon"
This reverts commit aac4e7f3f7223108315e83f2ec1eec23bdbac02e.
This causes threaded test failures:
AV1/AVxEncoderThreadTest.EncoderResultTest/*
AV1/TileIndependenceLSTest.MD5Match/*
AV1MultiThreaded/TestVectorTest.MD5Match/*
The follow up commits are also included in this revert:
Revert "Optimize av1_dist_wtd_convolve_2d_neon using SDOT instruction"
This reverts commit 147a1f52bec96c7eec86583d229ac9c5c90fe5fc.
Revert "Optimize av1_convolve_2d_sr_neon using SDOT instruction"
This reverts commit 2cc0666de131c2b44db08d0f51c07a2d7cd4073f.
Revert "Split av1_convolve_2d_sr_neon into horizontal/vertical helpers"
This reverts commit 9dfc5b34d857de0003b888594fc7dc62c8504d0b.
Change-Id: I7516a596bd9b39fa0ca70dec154aeea9dafda77f
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 40be27d..1a93f35 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -15,63 +15,29 @@
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"
-// Support for xN Neon intrinsics is lacking in some compilers.
-#if defined(__arm__) || defined(_M_ARM)
-#define ARM_32_BIT
-#endif
-
-// DEFICIENT_CLANG_32_BIT includes clang-cl.
-#if defined(__clang__) && defined(ARM_32_BIT) && \
- (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
-#define DEFICIENT_CLANG_32_BIT // This includes clang-cl.
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
-#define GCC_32_BIT
-#endif
-
-#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
-
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
- uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
- vld1q_u8(ptr + 2 * 16) } };
- return res;
-}
-
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+// Support for these xN intrinsics is lacking in older compilers.
+#if (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) || \
+ (defined(__GNUC__) && \
+ ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) || \
+ (defined(__clang__) && defined(__arm__) && \
+ (__clang_major__ <= 6 || \
+ (defined(__ANDROID__) && __clang_major__ <= 7)))))
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
return res;
}
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+static INLINE uint16x8x4_t vld1q_u16_x4(uint16_t const *ptr) {
uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
return res;
}
-
-#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
-#if __GNUC__ < 8
-
-static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
- uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
- return res;
-}
-
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
- uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
- vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
- return res;
-}
-#endif // __GNUC__ < 8
-
-#if __GNUC__ < 9
-static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
- uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
- vld1q_u8(ptr + 2 * 16) } };
- return res;
-}
-#endif // __GNUC__ < 9
-#endif // defined(__GNUC__) && !defined(__clang__)
+#endif // (defined(_MSC_VER) && !defined(__clang__) && !defined(_M_ARM64)) ||
+ // (defined(__GNUC__) &&
+ // ((!defined(__clang__) && (__GNUC__ < 8 || defined(__arm__))) ||
+ // (defined(__clang__) && defined(__arm__) &&
+ // (__clang_major__ <= 6 ||
+ // (defined(__ANDROID__) && __clang_major__ <= 7)))))
static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
const uint8x8_t s1) {
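
For reference, the xN load intrinsics handled in the hunk above simply compose N contiguous single-vector loads, so a missing intrinsic can be polyfilled exactly. A minimal sketch of the three-vector variant (the _fallback suffix is illustrative, not from the patch):

    #include <arm_neon.h>

    // Three contiguous 16-byte loads packed into a uint8x16x3_t, matching
    // what a conforming compiler's vld1q_u8_x3 intrinsic produces.
    static inline uint8x16x3_t vld1q_u8_x3_fallback(const uint8_t *ptr) {
      uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                             vld1q_u8(ptr + 2 * 16) } };
      return res;
    }
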
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index ff51b6c..28009d8 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -844,152 +844,6 @@
}
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void av1_convolve_2d_sr_horiz_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
- int im_h, const int16x8_t x_filter_s16, const int round_0) {
- const int bd = 8;
-
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
-
- int height = im_h;
-
- // Filter values are even, so downshift by 1 to reduce intermediate precision
- // requirements.
- const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
- const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
- // Dot product constants.
- const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 6);
- const int32x4_t correction =
- vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- assert(round_0 > 0);
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- load_u8_8x16(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
- t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl);
- t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
- permute_tbl);
- t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
- permute_tbl);
- t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
- d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
- d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
- d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
-
- store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- s0 = vld1q_u8(src_ptr);
- t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl);
- d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
-
- vst1_s16(dst_ptr, d0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- assert(height >= 4);
-
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
-
- d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
-
- vst1q_s16(d + 0 * dst_stride, d0);
- vst1q_s16(d + 1 * dst_stride, d1);
- vst1q_s16(d + 2 * dst_stride, d2);
- vst1q_s16(d + 3 * dst_stride, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * im_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
-
- do {
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- do {
- s0 = vld1q_u8(s);
- d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- vst1q_s16(d, d0);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- height--;
- } while (height > 0);
- }
- }
-}
-
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
-
// Horizontal filtering for convolve_2d_sr for width multiple of 8
// Processes one row at a time
static INLINE void horiz_filter_w8_single_row(
@@ -1038,12 +892,10 @@
// Horizontal filtering for convolve_2d_sr for width <= 4
// Processes one row at a time
-static INLINE void horiz_filter_w4_single_row(const uint8_t *src_ptr,
- int src_stride, int16_t *dst_ptr,
- const int dst_stride, int height,
- const int16x8_t x_filter,
- const int16x4_t horiz_const,
- const int16x4_t shift_round_0) {
+static INLINE void horiz_filter_w4_single_row(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int width, int height, const int16x8_t x_filter,
+ const int16x4_t horiz_const, const int16x4_t shift_round_0) {
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
const uint8_t *s = src_ptr;
@@ -1071,42 +923,83 @@
int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
horiz_const, shift_round_0);
- vst1_s16(dst_ptr, d0);
+ if (width == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += dst_stride;
+ } else if (width == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += dst_stride;
+ }
- dst_ptr += dst_stride;
src_ptr += src_stride;
height--;
} while (height > 0);
}
-static INLINE void av1_convolve_2d_sr_horiz_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
- int im_h, const int16x8_t x_filter_s16, const int round_0) {
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
+ int im_dst_stride;
+ int width, height;
+#if defined(__aarch64__)
+ uint8x8_t t0;
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ const uint8_t *s;
+#endif
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
const int bd = 8;
+ const int im_h = h + filter_params_y->taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- int height = im_h;
+ int16_t *dst_ptr;
+
+ dst_ptr = im_block;
+ im_dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
// Filter values are even, so downshift by 1 to reduce intermediate precision
// requirements.
- const int16x8_t x_filter = vshrq_n_s16(x_filter_s16, 1);
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
- assert(round_0 > 0);
+ assert(conv_params->round_0 > 0);
- if (w == 4) {
+ if (w <= 4) {
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x4_t shift_round_0 = vdup_n_s16(-(round_0 - 1));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
do {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- uint8x8_t t0, t1, t2, t3;
- const uint8_t *s = src_ptr;
-
assert(height >= 4);
+ s = src_ptr;
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1119,6 +1012,10 @@
s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
s += 7;
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
@@ -1139,44 +1036,56 @@
horiz_const, shift_round_0);
transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-
- vst1_s16((dst_ptr + 0 * dst_stride), d0);
- vst1_s16((dst_ptr + 1 * dst_stride), d1);
- vst1_s16((dst_ptr + 2 * dst_stride), d2);
- vst1_s16((dst_ptr + 3 * dst_stride), d3);
-
+ if (w == 4) {
+ vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ }
src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
+ dst_ptr += 4 * im_dst_stride;
height -= 4;
} while (height >= 4);
if (height) {
assert(height < 4);
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride,
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
height, x_filter, horiz_const, shift_round_0);
}
-
-#else // !defined(__aarch64__)
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, dst_stride, height,
- x_filter, horiz_const, shift_round_0);
-#endif // defined(__aarch64__)
+#else
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+ height, x_filter, horiz_const, shift_round_0);
+#endif
} else {
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0 - 1));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+ do {
+ assert(height >= 8);
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
- for (; height >= 8; height -= 8) {
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
- d0, d1, d2, d3, d4, d5, d6, d7;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
-
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1188,7 +1097,18 @@
s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s += 7;
+ width = w;
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
do {
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1204,26 +1124,28 @@
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- d0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
- horiz_const, shift_round_0);
- d1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
- horiz_const, shift_round_0);
- d2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
- horiz_const, shift_round_0);
- d3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
- horiz_const, shift_round_0);
- d4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
- horiz_const, shift_round_0);
- d5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
- horiz_const, shift_round_0);
- d6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
- horiz_const, shift_round_0);
- d7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
- horiz_const, shift_round_0);
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, x_filter,
+ horiz_const, shift_round_0);
- transpose_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
- store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+ store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
+ res6, res7);
s0 = s8;
s1 = s9;
@@ -1233,241 +1155,246 @@
s5 = s13;
s6 = s14;
s += 8;
- d += 8;
+ d_tmp += 8;
width -= 8;
} while (width > 0);
-
src_ptr += 8 * src_stride;
- dst_ptr += 8 * dst_stride;
- }
+ dst_ptr += 8 * im_dst_stride;
+ height -= 8;
+ } while (height >= 8);
- for (; height >= 4; height -= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
- dd0, dd1, dd2, dd3, dd4, dd5, dd6, dd7;
- int16x8_t d0, d1, d2, d3;
- uint8x8_t t0, t1, t2, t3;
+ if (height >= 4) {
+ assert(height < 8);
+ int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ reg10, reg11, reg12, reg13, reg14;
+ int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ int16x8_t out0, out1, out2, out3;
- const uint8_t *s = src_ptr;
- int16_t *d = dst_ptr;
- int width = w;
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s += 7;
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+ width = w;
do {
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- dd0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
- dd1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
- dd2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
- dd3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
- dd4 = convolve8_4x4(s4, s5, s6, s7, s8, s9, s10, s11, x_filter);
- dd5 = convolve8_4x4(s5, s6, s7, s8, s9, s10, s11, s12, x_filter);
- dd6 = convolve8_4x4(s6, s7, s8, s9, s10, s11, s12, s13, x_filter);
- dd7 = convolve8_4x4(s7, s8, s9, s10, s11, s12, s13, s14, x_filter);
+ d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+ x_filter);
- transpose_s16_4x8(&dd0, &dd1, &dd2, &dd3, &dd4, &dd5, &dd6, &dd7, &d0,
- &d1, &d2, &d3);
+ d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+ x_filter);
- d0 = vaddq_s16(d0, horiz_const);
- d1 = vaddq_s16(d1, horiz_const);
- d2 = vaddq_s16(d2, horiz_const);
- d3 = vaddq_s16(d3, horiz_const);
+ d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ x_filter);
- d0 = vqrshlq_s16(d0, shift_round_0);
- d1 = vqrshlq_s16(d1, shift_round_0);
- d2 = vqrshlq_s16(d2, shift_round_0);
- d3 = vqrshlq_s16(d3, shift_round_0);
+ d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+ x_filter);
- store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+ d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+ x_filter);
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
+ d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+ x_filter);
+
+ d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+ x_filter);
+
+ d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
+ x_filter);
+
+ transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
+ &out2, &out3);
+
+ out0 = vaddq_s16(out0, horiz_const);
+ out0 = vqrshlq_s16(out0, shift_round_0);
+
+ out1 = vaddq_s16(out1, horiz_const);
+ out1 = vqrshlq_s16(out1, shift_round_0);
+
+ out2 = vaddq_s16(out2, horiz_const);
+ out2 = vqrshlq_s16(out2, shift_round_0);
+
+ out3 = vaddq_s16(out3, horiz_const);
+ out3 = vqrshlq_s16(out3, shift_round_0);
+
+ store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+ reg0 = reg8;
+ reg1 = reg9;
+ reg2 = reg10;
+ reg3 = reg11;
+ reg4 = reg12;
+ reg5 = reg13;
+ reg6 = reg14;
s += 8;
- d += 8;
+ d_tmp += 8;
width -= 8;
} while (width > 0);
-
src_ptr += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
}
if (height) {
assert(height < 4);
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
height, x_filter, horiz_const, shift_round_0);
}
+#else
-#else // !defined(__aarch64__)
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, dst_stride, w,
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
height, x_filter, horiz_const, shift_round_0);
-#endif // defined(__aarch64__)
+#endif
}
-}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+ // vertical
+ {
+ uint8_t *dst_u8_ptr, *d_u8;
+ int16_t *v_src_ptr, *v_s;
-static INLINE void av1_convolve_2d_sr_vert_neon(
- int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
- int h, const int16x8_t y_filter, ConvolveParams *conv_params) {
- const int bd = 8;
- const int16_t round_bits =
- FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
- const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
- const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+ src_stride = im_stride;
+ v_src_ptr = im_block;
+ dst_u8_ptr = dst;
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
- int16x8_t dd0;
- uint8x8_t d01;
+ height = h;
+ width = w;
+
+ if (width <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t dd0;
+ uint8x8_t d01;
#if defined(__aarch64__)
- int16x4_t s8, s9, s10, d1, d2, d3;
- int16x8_t dd1;
- uint8x8_t d23;
-#endif // defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t dd1;
+ uint8x8_t d23;
+#endif
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ v_s = v_src_ptr;
- load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- s += (7 * src_stride);
+ __builtin_prefetch(v_s + 0 * im_stride);
+ __builtin_prefetch(v_s + 1 * im_stride);
+ __builtin_prefetch(v_s + 2 * im_stride);
+ __builtin_prefetch(v_s + 3 * im_stride);
+ __builtin_prefetch(v_s + 4 * im_stride);
+ __builtin_prefetch(v_s + 5 * im_stride);
+ __builtin_prefetch(v_s + 6 * im_stride);
+ __builtin_prefetch(v_s + 7 * im_stride);
- do {
-#if defined(__aarch64__)
- load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
- s += (4 * src_stride);
-
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const, sub_const_vec);
- d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_shift_vec, offset_const, sub_const_vec);
- d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_shift_vec, offset_const, sub_const_vec);
- d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_shift_vec, offset_const, sub_const_vec);
-
- dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
- dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
-
- d01 = vqmovun_s16(dd0);
- d23 = vqmovun_s16(dd1);
-
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d23), 1);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- h -= 4;
-#else // !defined(__aarch64__)
- s7 = vld1_s16(s);
- s += src_stride;
-
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const, sub_const_vec);
-
- dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
- d01 = vqmovun_s16(dd0);
-
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d01), 0);
- d += dst_stride;
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- h--;
-#endif // defined(__aarch64__)
- } while (h > 0);
- } else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t d0;
-#if defined(__aarch64__)
- int16x8_t s8, s9, s10;
- uint8x8_t d1, d2, d3;
-#endif // defined(__aarch64__)
-
- do {
- int height = h;
- int16_t *s = src_ptr;
- uint8_t *d = dst_ptr;
-
- load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- s += (7 * src_stride);
+ load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
do {
#if defined(__aarch64__)
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- s += (4 * src_stride);
+ load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
- d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- d1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- d2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- d3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 1 * dst_stride);
+ __builtin_prefetch(d_u8 + 2 * dst_stride);
+ __builtin_prefetch(d_u8 + 3 * dst_stride);
- vst1_u8(d, d0);
- d += dst_stride;
- vst1_u8(d, d1);
- d += dst_stride;
- vst1_u8(d, d2);
- d += dst_stride;
- vst1_u8(d, d3);
- d += dst_stride;
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_s16(vcombine_s16(d2, d3), vec_round_bits);
+
+ d01 = vqmovun_s16(dd0);
+ d23 = vqmovun_s16(dd1);
+
+ if ((w == 4) && (h != 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 0); // 20 21
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 2); // 30 31
+ d_u8 += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ }
s0 = s4;
s1 = s5;
@@ -1477,16 +1404,29 @@
s5 = s9;
s6 = s10;
height -= 4;
-#else // !defined(__aarch64__)
- s7 = vld1q_s16(s);
- s += src_stride;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
- d0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ sub_const_vec);
- vst1_u8(d, d0);
- d += dst_stride;
+ dd0 = vqrshlq_s16(vcombine_s16(d0, d0), vec_round_bits);
+ d01 = vqmovun_s16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
s0 = s1;
s1 = s2;
@@ -1495,49 +1435,109 @@
s4 = s5;
s5 = s6;
s6 = s7;
- height--;
-#endif // defined(__aarch64__)
+ height -= 1;
+#endif
} while (height > 0);
+ } else {
+ // if width is a multiple of 8 & height is a multiple of 4
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
- src_ptr += 8;
- dst_ptr += 8;
- w -= 8;
- } while (w > 0);
- }
-}
+ do {
+ __builtin_prefetch(v_src_ptr + 0 * im_stride);
+ __builtin_prefetch(v_src_ptr + 1 * im_stride);
+ __builtin_prefetch(v_src_ptr + 2 * im_stride);
+ __builtin_prefetch(v_src_ptr + 3 * im_stride);
+ __builtin_prefetch(v_src_ptr + 4 * im_stride);
+ __builtin_prefetch(v_src_ptr + 5 * im_stride);
+ __builtin_prefetch(v_src_ptr + 6 * im_stride);
+ __builtin_prefetch(v_src_ptr + 7 * im_stride);
-void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
- if (filter_params_x->taps > 8) {
- av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_qn,
- subpel_y_qn, conv_params);
- } else {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+ v_s = v_src_ptr;
+ load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ d_u8 = dst_u8_ptr;
+ height = h;
- const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
- const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
- av1_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w,
- im_h, x_filter, conv_params->round_0);
+ __builtin_prefetch(d_u8 + 4 * dst_stride);
+ __builtin_prefetch(d_u8 + 5 * dst_stride);
+ __builtin_prefetch(d_u8 + 6 * dst_stride);
+ __builtin_prefetch(d_u8 + 7 * dst_stride);
- av1_convolve_2d_sr_vert_neon(im_block, im_stride, dst, dst_stride, w, h,
- y_filter, conv_params);
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ if (h != 2) {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res2);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res3);
+ d_u8 += dst_stride;
+ } else {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ v_src_ptr += 8;
+ dst_u8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
}
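
A note on the "filter values are even, so downshift by 1" comment that appears in both the reverted and restored horizontal paths above: halving even taps is exact, and the lost factor of two is absorbed by rounding with one bit less afterwards, which is why shift_round_0 is built from -(round_0 - 1) rather than -round_0. Halving the taps also lowers the worst-case accumulator magnitude, which is the "intermediate precision" the comment refers to. In C terms, for integer samples s, even taps f, and r = conv_params->round_0:

    // Exact identity for even taps; ROUND_POWER_OF_TWO is the usual
    // rounding right shift: (x + (1 << (n - 1))) >> n.
    // ROUND_POWER_OF_TWO(dot(s, f), r) == ROUND_POWER_OF_TWO(dot(s, f / 2), r - 1)
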
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index e8c2ad6..05d781e 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -230,75 +230,6 @@
return sum;
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
- 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
- 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
- 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-};
-
-static INLINE int32x4_t convolve8_4_dot_s16(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Narrowing and packing is performed by the caller. */
- return sum;
-}
-
-static INLINE int16x8_t convolve8_8_dot_s16(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl,
- const int16x8_t shift_round_0) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- return vqrshlq_s16(sum, shift_round_0);
-}
-
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
static INLINE int16x4_t convolve8_4x4_s16(
const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
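
The correction vector in the reverted convolve8_4_dot_s16 / convolve8_8_dot_s16 helpers above compensates for the signed/unsigned mismatch: SDOT multiplies signed bytes, so each unsigned sample is shifted into signed range via s - 128, and since dot(s - 128, f) = dot(s, f) - 128 * sum(f), adding 128 * sum(f) back (folded with the rounding offset) restores the true convolution. A minimal sketch of how that constant is formed, assuming taps already halved as in the reverted av1_convolve_2d_sr path; 128 * (f >> 1) sums to sum(f) << 6, hence the shift by 6 (the unhalved jnt_convolve variant shifts by 7 instead). The helper name is illustrative:

    #include <arm_neon.h>

    // AArch64-only, like the reverted SDOT path (vaddlvq_s16 is A64-only).
    // Splat 128 * sum(halved taps) + rounding offset across all four lanes.
    static inline int32x4_t sdot_correction(int16x8_t x_filter_s16,
                                            int32_t horiz_const) {
      // f << 6 == 128 * (f >> 1) for even taps f.
      const int16x8_t scaled = vshlq_n_s16(x_filter_s16, 6);
      return vdupq_n_s32(vaddlvq_s16(scaled) + horiz_const);
    }
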
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index ddf55bc..6f4f58e 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -281,108 +281,6 @@
}
}
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE void dist_wtd_convolve_2d_horiz_neon(
- const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
- const int16x8_t x_filter_s16, const int im_h, int w, const int round_0) {
- const int bd = 8;
- int16_t *dst_ptr = im_block;
- int dst_stride = im_stride;
- int width = w;
- int height = im_h;
-
- const int8x8_t x_filter = vmovn_s16(x_filter_s16);
- const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
- // Dot product constants.
- const int16x8_t correct_tmp = vshlq_n_s16(x_filter_s16, 7);
- const int32x4_t correction =
- vdupq_n_s32(vaddlvq_s16(correct_tmp) + horiz_const);
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- if (w == 4) {
- const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
- uint8x16_t s0, s1, s2, s3;
- int32x4_t t0, t1, t2, t3;
- int16x4_t d0, d1, d2, d3;
-
- do {
- s0 = vld1q_u8(src + 0 * src_stride);
- s1 = vld1q_u8(src + 1 * src_stride);
- s2 = vld1q_u8(src + 2 * src_stride);
- s3 = vld1q_u8(src + 3 * src_stride);
-
- t0 = convolve8_4_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl);
- t1 = convolve8_4_dot_s16(s1, x_filter, correction, range_limit,
- permute_tbl);
- t2 = convolve8_4_dot_s16(s2, x_filter, correction, range_limit,
- permute_tbl);
- t3 = convolve8_4_dot_s16(s3, x_filter, correction, range_limit,
- permute_tbl);
-
- d0 = vqrshl_s16(vmovn_s32(t0), shift_round_0);
- d1 = vqrshl_s16(vmovn_s32(t1), shift_round_0);
- d2 = vqrshl_s16(vmovn_s32(t2), shift_round_0);
- d3 = vqrshl_s16(vmovn_s32(t3), shift_round_0);
-
- vst1_s16((dst_ptr + 0 * dst_stride), d0);
- vst1_s16((dst_ptr + 1 * dst_stride), d1);
- vst1_s16((dst_ptr + 2 * dst_stride), d2);
- vst1_s16((dst_ptr + 3 * dst_stride), d3);
-
- src += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- } else {
- const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
- const uint8_t *s;
- int16_t *d;
- uint8x16_t s0, s1, s2, s3;
- int16x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst_ptr;
-
- do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
-
- d0 = convolve8_8_dot_s16(s0, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d1 = convolve8_8_dot_s16(s1, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d2 = convolve8_8_dot_s16(s2, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
- d3 = convolve8_8_dot_s16(s3, x_filter, correction, range_limit,
- permute_tbl, shift_round_0);
-
- vst1q_s16(d + 0 * dst_stride, d0);
- vst1q_s16(d + 1 * dst_stride, d1);
- vst1q_s16(d + 2 * dst_stride, d2);
- vst1q_s16(d + 3 * dst_stride, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
-
- src += 4 * src_stride;
- dst_ptr += 4 * dst_stride;
- height -= 4;
- } while (height > 0);
- }
-}
-
-#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
-
static INLINE void dist_wtd_convolve_2d_horiz_neon(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
const int16x8_t x_filter, const int im_h, int w, const int round_0) {
@@ -627,8 +525,6 @@
}
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-
static INLINE void dist_wtd_convolve_2d_vert_neon(
int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
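
For completeness, the reverted SDOT scheme in this file and in convolve_neon.c computes four adjacent horizontal outputs per iteration: the permute table rearranges one 16-byte load into sliding 4-byte windows so that each vdotq_lane_s32 accumulates four taps for all four outputs at once. A minimal sketch mirroring the removed convolve8_4_dot_s16 (helper name illustrative; range clamp and correction as described above):

    #include <arm_neon.h>

    // Four outputs of an 8-tap filter via two SDOT instructions. 'samples'
    // holds pixels already clamped to signed range; 'filters' holds the taps
    // narrowed to int8; 'correction' carries the range-clamp compensation.
    static inline int32x4_t convolve8_4_sdot_sketch(
        int8x16_t samples, int8x8_t filters, int32x4_t correction,
        uint8x16x2_t permute_tbl) {
      // Windows { s0..s3, s1..s4, s2..s5, s3..s6 } and
      //         { s4..s7, s5..s8, s6..s9, s7..s10 }.
      const int8x16_t perm0 = vqtbl1q_s8(samples, permute_tbl.val[0]);
      const int8x16_t perm1 = vqtbl1q_s8(samples, permute_tbl.val[1]);
      int32x4_t sum = vdotq_lane_s32(correction, perm0, filters, 0);
      return vdotq_lane_s32(sum, perm1, filters, 1);
    }
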