Fix temp buffer size in ssse3/neon scaling

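The temp buffer was sized from the dimensions of the plane being scaled,
which might not be big enough for the uv planes. Size it from the y plane
dimensions instead.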

Change-Id: I225abff4e7e6e17f1b2ccb38c544516a5dc20ebe
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index b570e15..86c0c3f 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -746,6 +746,8 @@
     const int src_h = src->crop_heights[is_uv];
     const int dst_w = dst->crop_widths[is_uv];
     const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = dst->crop_widths[0];
+    const int dst_y_h = dst->crop_heights[0];
 
     if (2 * dst_w == src_w && 2 * dst_h == src_h) {
       if (phase == 0) {
@@ -759,8 +761,8 @@
                                     dst->buffers[i], dst->strides[is_uv], dst_w,
                                     dst_h, c0, c1);
       } else {
-        const int buffer_stride = (dst_w + 3) & ~3;
-        const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
         uint8_t *const temp_buffer =
             (uint8_t *)malloc(buffer_stride * buffer_height);
         if (temp_buffer) {
@@ -786,8 +788,8 @@
                                     dst->buffers[i], dst->strides[is_uv], dst_w,
                                     dst_h, c0, c1);
       } else {
-        const int buffer_stride = (dst_w + 1) & ~1;
-        const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
         uint8_t *const temp_buffer =
             (uint8_t *)malloc(buffer_stride * buffer_height);
         if (temp_buffer) {
@@ -803,8 +805,8 @@
       }
     } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
       // 4 to 3
-      const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
-      const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
       uint8_t *const temp_buffer =
           (uint8_t *)malloc(buffer_stride * buffer_height);
       if (temp_buffer) {
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
index 1b25f46..0878baa 100644
--- a/av1/common/x86/resize_ssse3.c
+++ b/av1/common/x86/resize_ssse3.c
@@ -850,8 +850,11 @@
     const int is_uv = i > 0;
     const int src_w = src->crop_widths[is_uv];
     const int src_h = src->crop_heights[is_uv];
+    const int src_y_w = src->crop_widths[0];
     const int dst_w = dst->crop_widths[is_uv];
     const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = dst->crop_widths[0];
+    const int dst_y_h = dst->crop_heights[0];
 
     if (2 * dst_w == src_w && 2 * dst_h == src_h) {
       // 2 to 1
@@ -867,8 +870,8 @@
                                     dst->buffers[i], dst->strides[is_uv], dst_w,
                                     dst_h, c0c1);
       } else {
-        const int buffer_stride = (dst_w + 3) & ~3;
-        const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
         uint8_t *const temp_buffer =
             (uint8_t *)malloc(buffer_stride * buffer_height);
         if (temp_buffer) {
@@ -896,8 +899,8 @@
                                     dst->buffers[i], dst->strides[is_uv], dst_w,
                                     dst_h, c0c1);
       } else {
-        const int buffer_stride = (dst_w + 1) & ~1;
-        const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
         // When dst_w is 1 or 2, we need extra padding to avoid heap read
         // overflow
         const int extra_padding = 16;
@@ -916,9 +919,9 @@
       }
     } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
       // 4 to 3
-      const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
-      const int buffer_stride_ver = (dst_w + 7) & ~7;
-      const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_stride_ver = (dst_y_w + 7) & ~7;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
       // When the vertical filter reads more pixels than the horizontal filter
       // generated in each row, we need extra padding to avoid heap read
       // overflow. For example, the horizontal filter generates 18 pixels but
@@ -942,7 +945,7 @@
       }
     } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
       // 1 to 2
-      uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
+      uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
       if (temp_buffer) {
         const InterpKernel *interp_kernel =
             (const InterpKernel *)av1_interp_filter_params_list[filter]