Add SIMD code for block copies.

The scalar block-copy loops in cdef.c and od_dering.c are replaced by
rtcd-dispatched copy_*() functions with v64/v128 implementations in
od_dering_simd.h, specialized for sse2, ssse3, sse4_1 and neon.

Change-Id: I696da03fb5e9e87d054a9aa9238ad96937a0e281
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 0f47410..3713e11 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -628,7 +628,15 @@
   add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
   add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
   add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
-  # VS compiling for 32 bit targets does not support vector types in
+
+  add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+  add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+  add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+  add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+  add_proto qw/void copy_nxm_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int n, int m";
+  add_proto qw/void copy_nxm_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int n, int m";
+
+  # VS compiling for 32 bit targets does not support vector types in
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
@@ -639,6 +647,13 @@
     specialize qw/od_dir_find8 sse2 ssse3 sse4_1 neon/;
     specialize qw/od_filter_dering_direction_4x4 sse2 ssse3 sse4_1 neon/;
     specialize qw/od_filter_dering_direction_8x8 sse2 ssse3 sse4_1 neon/;
+
+    specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_nxm_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_nxm_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
   }
 }
 
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index d8f40b9..5f0d30a 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -78,28 +78,41 @@
   return count;
 }
 
-/* TODO: Optimize this function for SSE. */
-static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
-                        const uint8_t *src, int src_voffset, int src_hoffset,
-                        int sstride, int vsize, int hsize) {
-  int r, c;
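+/* C reference version of the rtcd block copy; n is the block width and m its
+   height, with the 8-bit source widened to 16 bits. */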
+void copy_nxm_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
+                              int sstride, int n, int m) {
+  int i, j;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < n; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
+
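+/* Same as above, but with a 16-bit source. */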
+void copy_nxm_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
+                               int sstride, int n, int m) {
+  int i, j;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < n; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
+
+void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
+                 const uint8_t *src, int src_voffset, int src_hoffset,
+                 int sstride, int vsize, int hsize) {
 #if CONFIG_AOM_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
-    for (r = 0; r < vsize; r++) {
-      for (c = 0; c < hsize; c++) {
-        dst[r * dstride + c] = base[r * sstride + c];
-      }
-    }
+    copy_nxm_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
   } else {
 #endif
     const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
-    for (r = 0; r < vsize; r++) {
-      for (c = 0; c < hsize; c++) {
-        dst[r * dstride + c] = base[r * sstride + c];
-      }
-    }
+    copy_nxm_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
 #if CONFIG_AOM_HIGHBITDEPTH
   }
 #endif
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 1833d33..2432ec8 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -203,21 +203,20 @@
   return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
 }
 
-static INLINE void copy_8x8_16bit_to_16bit(uint16_t *dst, int dstride,
-                                           uint16_t *src, int sstride) {
+void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
+                               int sstride) {
   int i, j;
   for (i = 0; i < 8; i++)
     for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j];
 }
 
-static INLINE void copy_4x4_16bit_to_16bit(uint16_t *dst, int dstride,
-                                           uint16_t *src, int sstride) {
+void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
+                               int sstride) {
   int i, j;
   for (i = 0; i < 4; i++)
     for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j];
 }
 
-/* TODO: Optimize this function for SSE. */
 void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
                                 dering_list *dlist, int dering_count,
                                 BLOCK_SIZE bsize) {
@@ -243,26 +242,25 @@
   }
 }
 
-static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride,
-                                          uint16_t *src, int sstride) {
+void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
+                              int sstride) {
   int i, j;
   for (i = 0; i < 8; i++)
     for (j = 0; j < 8; j++)
       dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
 }
 
-static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride,
-                                          uint16_t *src, int sstride) {
+void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
+                              int sstride) {
   int i, j;
   for (i = 0; i < 4; i++)
     for (j = 0; j < 4; j++)
       dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
 }
 
-/* TODO: Optimize this function for SSE. */
-static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src,
-                                      dering_list *dlist, int dering_count,
-                                      int bsize) {
+static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride,
+                                      const uint16_t *src, dering_list *dlist,
+                                      int dering_count, int bsize) {
   int bi, bx, by;
   if (bsize == 3) {
     for (bi = 0; bi < dering_count; bi++) {
diff --git a/av1/common/od_dering_simd.h b/av1/common/od_dering_simd.h
index f58dac1..d1a0cf5 100644
--- a/av1/common/od_dering_simd.h
+++ b/av1/common/od_dering_simd.h
@@ -366,3 +366,80 @@
   }
   return (int)((v128_dotp_s16(total_abs, v128_dup_16(1)) + 8) >> 4);
 }
+
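+/* Narrow each 8-pixel row from 16 to 8 bits with a saturating pack. */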
+void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride,
+                                       const uint16_t *src, int sstride) {
+  int i;
+  for (i = 0; i < 8; i++) {
+    v128 row = v128_load_unaligned(&src[i * sstride]);
+    row = v128_pack_s16_u8(row, row);
+    v64_store_unaligned(&dst[i * dstride], v128_low_v64(row));
+  }
+}
+
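+/* 4x4 version: a full 8-lane row is loaded, but only the low 32 bits of the
+   packed result are stored. */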
+void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride,
+                                       const uint16_t *src, int sstride) {
+  int i;
+  for (i = 0; i < 4; i++) {
+    v128 row = v128_load_unaligned(&src[i * sstride]);
+    row = v128_pack_s16_u8(row, row);
+    u32_store_unaligned(&dst[i * dstride], v128_low_u32(row));
+  }
+}
+
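+/* Copy an 8x8 block of 16-bit pixels one 128-bit row at a time. */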
+void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                        const uint16_t *src, int sstride) {
+  int i;
+  for (i = 0; i < 8; i++) {
+    v128 row = v128_load_unaligned(&src[i * sstride]);
+    v128_store_unaligned(&dst[i * dstride], row);
+  }
+}
+
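+/* Copy a 4x4 block of 16-bit pixels one 64-bit row at a time. */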
+void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                        const uint16_t *src, int sstride) {
+  int i;
+  for (i = 0; i < 4; i++) {
+    v64 row = v64_load_unaligned(&src[i * sstride]);
+    v64_store_unaligned(&dst[i * dstride], row);
+  }
+}
+
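+/* Copy an n (width) x m (height) block, widening 8 pixels at a time from
+   8 to 16 bits; a scalar loop handles any remaining columns. */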
+void SIMD_FUNC(copy_nxm_8bit_to_16bit)(uint16_t *dst, int dstride,
+                                       const uint8_t *src, int sstride, int n,
+                                       int m) {
+  int i, j;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < (n & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < n; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
+
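+/* Copy an n (width) x m (height) block of 16-bit pixels, 8 pixels at a time,
+   with a scalar loop for any remaining columns. */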
+void SIMD_FUNC(copy_nxm_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                        const uint16_t *src, int sstride, int n,
+                                        int m) {
+  int i, j;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < (n & ~0x7); j += 8) {
+      v128 row = v128_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], row);
+    }
+    for (; j < n; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}