cdef_block_neon: fix cdef_copy_rect8*

after:
3c9bbf8002 Add explicit Neon implementation of cdef_copy_rect8 functions

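In these functions, width is always a multiple of 4, but it is not
necessarily a block width (e.g., it can be 12 or 20). Handle the
remaining 8 and/or 4 columns of each row relative to the columns
already copied, instead of assuming the row is exactly that wide.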

Change-Id: I94e40bc0b92f2bccdc667a3ceb27b2a4cb5f6bfb
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 1bcf6a1..c4a1acb 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -33,11 +33,13 @@
 
       w += 16;
     }
-    if (width - w == 8) {
+    if (width - w >= 8) {
       uint8x8_t row = vld1_u8(src_ptr + w);
       vst1q_u16(dst_ptr + w, vmovl_u8(row));
-    } else if (width - w == 4) {
-      for (int i = 0; i < 4; i++) {
+      w += 8;
+    }
+    if (width - w == 4) {
+      for (int i = w; i < w + 4; i++) {
         dst_ptr[i] = src_ptr[i];
       }
     }
@@ -61,9 +63,9 @@
 
       w += 8;
     }
-    if (width == 4) {
-      uint16x4_t row = vld1_u16(src_ptr);
-      vst1_u16(dst_ptr, row);
+    if (width - w == 4) {
+      uint16x4_t row = vld1_u16(src_ptr + w);
+      vst1_u16(dst_ptr + w, row);
     }
 
     src += sstride;