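cdef_block_neon: fix cdef_copy_rect8* after 3c9bbf8002 ("Add explicit Neon implementation of cdef_copy_rect8 functions"). In these functions, width is a multiple of 4, but may not be the width of a block, e.g., 12 or 20. The remainder handling must therefore continue from the current column offset w (copying a further 8 and then 4 elements where needed) instead of assuming the whole row is exactly 8 or 4 wide. Change-Id: I94e40bc0b92f2bccdc667a3ceb27b2a4cb5f6bfb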
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c index 1bcf6a1..c4a1acb 100644 --- a/av1/common/arm/cdef_block_neon.c +++ b/av1/common/arm/cdef_block_neon.c
@@ -33,11 +33,13 @@ w += 16; } - if (width - w == 8) { + if (width - w >= 8) { uint8x8_t row = vld1_u8(src_ptr + w); vst1q_u16(dst_ptr + w, vmovl_u8(row)); - } else if (width - w == 4) { - for (int i = 0; i < 4; i++) { + w += 8; + } + if (width - w == 4) { + for (int i = w; i < w + 4; i++) { dst_ptr[i] = src_ptr[i]; } } @@ -61,9 +63,9 @@ w += 8; } - if (width == 4) { - uint16x4_t row = vld1_u16(src_ptr); - vst1_u16(dst_ptr, row); + if (width - w == 4) { + uint16x4_t row = vld1_u16(src_ptr + w); + vst1_u16(dst_ptr + w, row); } src += sstride;