Add explicit Neon implementation of cdef_copy_rect8 functions

Write an explicit Neon implementation of cdef_copy_rect8_8bit_to_16bit
and cdef_copy_rect8_16bit_to_16bit. This is the last patch to translate
cdef functions from the architecture agnostic layer to explicit Neon
intrinsics.

Change-Id: I00ee7f7c130394f586e0405ffa499aa78fc8b910
diff --git a/av1/common/arm/cdef_block_neon.c b/av1/common/arm/cdef_block_neon.c
index 4465e0b..1bcf6a1 100644
--- a/av1/common/arm/cdef_block_neon.c
+++ b/av1/common/arm/cdef_block_neon.c
@@ -9,25 +9,66 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#include "aom_dsp/arm/mem_neon.h"
+#include <arm_neon.h>
+#include <assert.h>
 
-#define SIMD_FUNC(name) name##_neon
-#include "av1/common/cdef_block_simd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/cdef_block.h"
 
 void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint8_t *src, int sstride,
                                         int width, int height) {
-  int j;
-  for (int i = 0; i < height; i++) {
-    for (j = 0; j < (width & ~0x7); j += 8) {
-      v64 row = v64_load_unaligned(&src[i * sstride + j]);
-      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+  do {
+    const uint8_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+
+    int w = 0;
+    while (w <= width - 16) {
+      uint8x16_t row = vld1q_u8(src_ptr + w);
+      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
+      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
+
+      w += 16;
     }
-    for (; j < width; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
+    if (width - w == 8) {
+      uint8x8_t row = vld1_u8(src_ptr + w);
+      vst1q_u16(dst_ptr + w, vmovl_u8(row));
+    } else if (width - w == 4) {
+      for (int i = 0; i < 4; i++) {
+        dst_ptr[i] = src_ptr[i];
+      }
     }
-  }
+
+    src += sstride;
+    dst += dstride;
+  } while (--height != 0);
+}
+
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
+                                         const uint16_t *src, int sstride,
+                                         int width, int height) {
+  do {
+    const uint16_t *src_ptr = src;
+    uint16_t *dst_ptr = dst;
+
+    int w = 0;
+    while (width - w >= 8) {
+      uint16x8_t row = vld1q_u16(src_ptr + w);
+      vst1q_u16(dst_ptr + w, row);
+
+      w += 8;
+    }
+    if (width == 4) {
+      uint16x4_t row = vld1_u16(src_ptr);
+      vst1_u16(dst_ptr, row);
+    }
+
+    src += sstride;
+    dst += dstride;
+  } while (--height != 0);
 }
 
 static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) {
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index e86aa75..5c62201 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -158,9 +158,6 @@
   res[0] = v128_ziphi_64(tr1_7, tr1_6);
 }
 
-// There is a separate Neon implementation of this function, so disable this
-// one.
-#if !HAVE_NEON
 int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
                              int coeff_shift) {
   int i;
@@ -199,7 +196,6 @@
   *var >>= 10;
   return best_dir;
 }
-#endif
 
 // Work around compiler out of memory issues with Win32 builds. This issue has
 // been observed with Visual Studio 2017, 2019, and 2022 (version 17.4).
@@ -209,9 +205,6 @@
 #define CDEF_INLINE SIMD_INLINE
 #endif
 
-// There is a separate Neon implementation of these functions, so disable this
-// one.
-#if !HAVE_NEON
 // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
 CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
                              unsigned int adjdamp) {
@@ -830,7 +823,6 @@
     copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
   }
 }
-#endif  // HAVE_NEON
 
 void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                                const uint16_t *src, int sstride,