cdef_block_neon: fix cdef_copy_rect8*
This is a follow-up to:
3c9bbf8002 Add explicit Neon implementation of cdef_copy_rect8 functions
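In these functions, width is a multiple of 4, but may not be the width
of a block, e.g., 12 or 20. The tail handling after the main vector loop
only covered a remainder of exactly 8 or exactly 4 columns (8-bit source)
or a total width of exactly 4 (16-bit source), and copied the final 4
columns from offset 0 instead of offset w. Handle a remaining 8 and/or 4
columns at offset w so that every such width is copied completely.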
Change-Id: I94e40bc0b92f2bccdc667a3ceb27b2a4cb5f6bfb
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 1bcf6a1..c4a1acb 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -33,11 +33,13 @@
w += 16;
}
- if (width - w == 8) {
+ if (width - w >= 8) {
uint8x8_t row = vld1_u8(src_ptr + w);
vst1q_u16(dst_ptr + w, vmovl_u8(row));
- } else if (width - w == 4) {
- for (int i = 0; i < 4; i++) {
+ w += 8;
+ }
+ if (width - w == 4) {
+ for (int i = w; i < w + 4; i++) {
dst_ptr[i] = src_ptr[i];
}
}
@@ -61,9 +63,9 @@
w += 8;
}
- if (width == 4) {
- uint16x4_t row = vld1_u16(src_ptr);
- vst1_u16(dst_ptr, row);
+ if (width - w == 4) {
+ uint16x4_t row = vld1_u16(src_ptr + w);
+ vst1_u16(dst_ptr + w, row);
}
src += sstride;