Add explicit Neon implementation of cdef_copy_rect8 functions
Write an explicit Neon implementation of cdef_copy_rect8_8bit_to_16bit
and cdef_copy_rect8_16bit_to_16bit. This is the last patch to translate
cdef functions from the architecture-agnostic layer to explicit Neon
intrinsics.
Change-Id: I00ee7f7c130394f586e0405ffa499aa78fc8b910
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 4465e0b..1bcf6a1 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -9,25 +9,66 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "aom_dsp/aom_simd.h"
-#include "aom_dsp/arm/mem_neon.h"
+#include <arm_neon.h>
+#include <assert.h>
-#define SIMD_FUNC(name) name##_neon
-#include "av1/common/cdef_block_simd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/cdef_block.h"
void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
const uint8_t *src, int sstride,
int width, int height) {
- int j;
- for (int i = 0; i < height; i++) {
- for (j = 0; j < (width & ~0x7); j += 8) {
- v64 row = v64_load_unaligned(&src[i * sstride + j]);
- v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ do {
+ const uint8_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+
+ int w = 0;
+ while (w <= width - 16) {
+ uint8x16_t row = vld1q_u8(src_ptr + w);
+ uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
+ vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
+
+ w += 16;
}
- for (; j < width; j++) {
- dst[i * dstride + j] = src[i * sstride + j];
+ if (width - w == 8) {
+ uint8x8_t row = vld1_u8(src_ptr + w);
+ vst1q_u16(dst_ptr + w, vmovl_u8(row));
+ } else if (width - w == 4) {
+ for (int i = 0; i < 4; i++) {
+ dst_ptr[i] = src_ptr[i];
+ }
}
- }
+
+ src += sstride;
+ dst += dstride;
+ } while (--height != 0);
+}
+
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride,
+ int width, int height) {
+ do {
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+
+ int w = 0;
+ while (width - w >= 8) {
+ uint16x8_t row = vld1q_u16(src_ptr + w);
+ vst1q_u16(dst_ptr + w, row);
+
+ w += 8;
+ }
+ if (width == 4) {
+ uint16x4_t row = vld1_u16(src_ptr);
+ vst1_u16(dst_ptr, row);
+ }
+
+ src += sstride;
+ dst += dstride;
+ } while (--height != 0);
}
static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) {
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index e86aa75..5c62201 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -158,9 +158,6 @@
res[0] = v128_ziphi_64(tr1_7, tr1_6);
}
-// There is a separate Neon implementation of this function, so disable this
-// one.
-#if !HAVE_NEON
int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
int coeff_shift) {
int i;
@@ -199,7 +196,6 @@
*var >>= 10;
return best_dir;
}
-#endif
// Work around compiler out of memory issues with Win32 builds. This issue has
// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4).
@@ -209,9 +205,6 @@
#define CDEF_INLINE SIMD_INLINE
#endif
-// There is a separate Neon implementation of these functions, so disable this
-// one.
-#if !HAVE_NEON
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
unsigned int adjdamp) {
@@ -830,7 +823,6 @@
copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
}
}
-#endif // HAVE_NEON
void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
const uint16_t *src, int sstride,