replace per-element copy with memcpy

memcpy should already include SSE/AVX optimization, so the hand-written
SSE version requested by the removed FIXME is no longer needed.

Observed a 0.3% encoding time reduction, measured on two-pass
encoding of 720p content at speed-5.

Change-Id: I7520984b65de32c4b899336d82c1e3c444ab6153
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index beada59..d3ae348 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -165,17 +165,13 @@
   return best_tot_mse;
 }
 
-/* FIXME: SSE-optimize this. */
 static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
                          int src_voffset, int src_hoffset, int sstride,
                          int vsize, int hsize) {
-  int r, c;
+  int r;
   const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
-  for (r = 0; r < vsize; r++) {
-    for (c = 0; c < hsize; c++) {
-      dst[r * dstride + c] = base[r * sstride + c];
-    }
-  }
+  for (r = 0; r < vsize; r++)
+    memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
 }
 
 #if CONFIG_DIST_8X8