Fix some UBSan warnings

* Make intermediate arrays in av1(_highbd)_warp_affine_c signed
  (int32_t), to avoid integer overflow when multiplying an
  'unsigned int' by a negative 'int' value.

* Pad out arrays in masked_variance_test.cc so that the array
  stride is a multiple of 16 bytes.
  This fixes some UBSan errors in masked_variance_intrin_ssse3.c
  related to unaligned loads of 32-bit values.

BUG=aomedia:572

Change-Id: I0cf786c94870ff128c883bed8e900b0686afc3f7
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 6158b15..5bacd05 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -954,7 +954,7 @@
                               int subsampling_y, int bd, int comp_avg,
                               int16_t alpha, int16_t beta, int16_t gamma,
                               int16_t delta) {
-  uint32_t tmp[15 * 8];
+  int32_t tmp[15 * 8];
   int i, j, k, l, m;
 
   for (i = p_row; i < p_row + p_height; i += 8) {
@@ -1250,7 +1250,7 @@
                        int subsampling_x, int subsampling_y, int comp_avg,
                        int16_t alpha, int16_t beta, int16_t gamma,
                        int16_t delta) {
-  uint16_t tmp[15 * 8];
+  int32_t tmp[15 * 8];
   int i, j, k, l, m;
   const int bd = 8;
 
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 979335d..a9cbdc8 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -59,30 +59,25 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  // Note: We pad the input arrays out with 15 extra elements, since the SSE
-  // implementations can read up to 15 elements off the end of the main data.
-  // The extra data is never actually used, but it simplifies the code
-  // if we can do this.
+  // Note: Each row of the input arrays is padded out to a multiple of
+  // 16 bytes, so that consecutive rows keep the 16-byte alignment.
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
   DECLARE_ALIGNED(16, uint8_t,
-                  src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SB_SIZE + 1);
-  int ref_stride = (MAX_SB_SIZE + 1);
-  int msk_stride = (MAX_SB_SIZE + 1);
+  int src_stride = (MAX_SB_SIZE + 16);
+  int ref_stride = (MAX_SB_SIZE + 16);
+  int msk_stride = (MAX_SB_SIZE + 16);
   int xoffset;
   int yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
     int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
     int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
-    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       second_pred_ptr[j] = rnd.Rand8();
@@ -120,33 +115,30 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
   DECLARE_ALIGNED(16, uint8_t,
-                  src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 15]);
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SB_SIZE + 1);
-  int ref_stride = (MAX_SB_SIZE + 1);
-  int msk_stride = (MAX_SB_SIZE + 1);
+  int src_stride = (MAX_SB_SIZE + 16);
+  int ref_stride = (MAX_SB_SIZE + 16);
+  int msk_stride = (MAX_SB_SIZE + 16);
 
   for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
     for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
       for (int i = 0; i < 16; ++i) {
         memset(src_ptr, (i & 0x1) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
         memset(ref_ptr, (i & 0x2) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
         memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
         memset(msk_ptr, (i & 0x8) ? 64 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
 
         for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
           ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
@@ -204,18 +196,11 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  // Note: We pad the input arrays out with 7 extra elements, since the SSE
-  // implementations can read up to 7 elements off the end of the main data.
-  // The extra data is never actually used, but it simplifies the code
-  // if we can do this.
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
   DECLARE_ALIGNED(16, uint16_t,
-                  src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint16_t,
-                  ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint16_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
   uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
@@ -223,13 +208,13 @@
   int first_failure = -1;
   int first_failure_x = -1;
   int first_failure_y = -1;
-  int src_stride = (MAX_SB_SIZE + 1);
-  int ref_stride = (MAX_SB_SIZE + 1);
-  int msk_stride = (MAX_SB_SIZE + 1);
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
   int xoffset, yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8); j++) {
       src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
@@ -270,14 +255,11 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
   DECLARE_ALIGNED(16, uint16_t,
-                  src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint16_t,
-                  ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
-  DECLARE_ALIGNED(16, uint16_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1) + 7]);
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
   uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
@@ -285,21 +267,21 @@
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SB_SIZE + 1);
-  int ref_stride = (MAX_SB_SIZE + 1);
-  int msk_stride = (MAX_SB_SIZE + 1);
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
 
   for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
     for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
       for (int i = 0; i < 16; ++i) {
         aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
         aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
         aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
         memset(msk_ptr, (i & 0x8) ? 64 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
 
         for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
           ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,