Fix memory over-read issue in av1_resize_horz_dir() SIMD

This CL fixes the test failures under 32-bit valgrind due
to memory over-read issue reported in Bug: aomedia:3575.
To fix this issue pixel overloading at frame boundary is
avoided. Also av1_resize_horz_dir_sse2() is enabled.

Bug: aomedia:3575
Change-Id: I50d87adb033c7e2cab036d66d49c11b5b81469ca
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7c4f539..7fa8b4e 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -558,9 +558,7 @@
 specialize qw/av1_resize_vert_dir sse2 avx2/;
 
 add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
-# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest
-# passes under 32-bit valgrind.
-specialize qw/av1_resize_horz_dir avx2/;
+specialize qw/av1_resize_horz_dir sse2 avx2/;
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c
index 425c9f4..9c8958e 100644
--- a/av1/common/x86/resize_avx2.c
+++ b/av1/common/x86/resize_avx2.c
@@ -17,6 +17,7 @@
 
 #include "aom_dsp/x86/synonyms.h"
 
+#define ROW_OFFSET 5
 #define CAST_HI(x) _mm256_castsi128_si256(x)
 #define CAST_LOW(x) _mm256_castsi256_si128(x)
 
@@ -122,7 +123,7 @@
   filter_offset = 3;                                                           \
                                                                                \
   /* Pad start pixels to the left, while processing the first pixels in the    \
-    row. */                                                                    \
+   * row. */                                                                   \
   if (j == 0) {                                                                \
     /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */                         \
     row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);                       \
@@ -131,21 +132,24 @@
     r0 = row0;                                                                 \
     r1 = row1;                                                                 \
   }                                                                            \
-                                                                               \
+  const int is_last_cols32 = (j + 32 == filtered_length);                      \
+  /* Avoid loading extra pixels at frame boundary.*/                           \
+  if (is_last_cols32) row_offset = ROW_OFFSET;                                 \
   /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/                                \
   __m128i row0_0 = _mm_loadl_epi64(                                            \
-      (__m128i *)&input[i * in_stride + 32 + j - filter_offset]);              \
+      (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \
   /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */                               \
-  __m128i row1_0 = _mm_loadl_epi64(                                            \
-      (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]);        \
+  __m128i row1_0 =                                                             \
+      _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j -         \
+                                        filter_offset - row_offset]);          \
   __m256i r2 = _mm256_permute2x128_si256(                                      \
       _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20);   \
                                                                                \
   /* Pad end pixels to the right, while processing the last pixels in the      \
-  row. */                                                                      \
-  const int is_last_cols32 = (j + 32 == filtered_length);                      \
+   * row. */                                                                   \
   if (is_last_cols32) {                                                        \
-    r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask);                           \
+    r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET),                \
+                             wd32_end_pad_mask);                               \
   }                                                                            \
                                                                                \
   /* Process even pixels of the first row  */                                  \
@@ -169,7 +173,8 @@
   s1[3] = _mm256_alignr_epi8(r2, r1, 6);                                       \
                                                                                \
   /* The register res_out_0 stores the result of start-16 pixels corresponding \
-to the first and second rows whereas res_out_1 stores the end-16 pixels. */    \
+   * to the first and second rows whereas res_out_1 stores the end-16          \
+   * pixels. */                                                                \
   __m256i res_out_0[2], res_out_1[2];                                          \
   res_out_1[0] = res_out_1[1] = zero;                                          \
   res_out_0[0] = res_out_0[1] = zero;                                          \
@@ -184,7 +189,7 @@
   /* r00-r03 r08-r011 | r04-r07 r012-r015 */                                   \
   __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]);        \
                                                                                \
-  /* result of 32 pixels of row1 (b0 to b32) */                                \
+  /* Result of 32 pixels of row1 (b0 to b32) */                                \
   res_out_0[1] = _mm256_sra_epi32(                                             \
       _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);     \
   res_out_1[1] = _mm256_sra_epi32(                                             \
@@ -530,12 +535,10 @@
                               uint8_t *intbuf, int height, int filtered_length,
                               int width2) {
   assert(height % 2 == 0);
-  // Invoke C for width less than 32.
-  // TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest
-  // passes under 32-bit valgrind.
+  // Invoke SSE2 for width less than 32.
   if (filtered_length < 32) {
-    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
-                          width2);
+    av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
+                             width2);
     return;
   }
 
@@ -569,6 +572,7 @@
   if (filtered_length % 32 == 0) {
     for (int i = 0; i < height; i += 2) {
       int filter_offset = 0;
+      int row_offset = 0;
       for (int j = 0; j < filtered_length; j += 32) {
         PROCESS_RESIZE_X_WD32
       }
@@ -576,28 +580,50 @@
   } else {
     for (int i = 0; i < height; i += 2) {
       int filter_offset = 0;
-      int remain_col = filtered_length % 32;
-      for (int j = 0; j + 32 <= filtered_length; j += 32) {
+      int remain_col = filtered_length;
+      int row_offset = 0;
+      // To avoid pixel over-read at frame boundary, processing of 32 pixels
+      // is done using the core loop only if sufficient number of pixels
+      // required for the load are present. The remaining pixels are processed
+      // separately.
+      for (int j = 0; j <= filtered_length - 32; j += 32) {
+        if (remain_col == 34 || remain_col == 36) {
+          break;
+        }
         PROCESS_RESIZE_X_WD32
+        remain_col -= 32;
       }
 
       int wd_processed = filtered_length - remain_col;
-      if (remain_col > 15) {
-        remain_col = filtered_length % 16;
-        const int in_idx = i * in_stride + wd_processed - filter_offset;
+      // To avoid pixel over-read at frame boundary, processing of 16 pixels
+      // is done only if sufficient number of pixels required for the
+      // load are present. The remaining pixels are processed separately.
+      if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
+        remain_col = filtered_length - wd_processed - 16;
+        const int in_idx = i * in_stride + wd_processed;
         const int out_idx = (i * dst_stride) + wd_processed / 2;
         // a0 a1 --- a15
-        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+        __m128i row0 =
+            _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]);
         // b0 b1 --- b15
-        __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+        __m128i row1 = _mm_loadu_si128(
+            (__m128i *)&input[in_idx + in_stride - filter_offset]);
         // a0 a1 --- a15 || b0 b1 --- b15
         __m256i r0 =
             _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
+        if (filter_offset == 0) {
+          r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
+        }
+        filter_offset = 3;
+        const int is_last_cols16 = wd_processed + 16 == filtered_length;
+        if (is_last_cols16) row_offset = ROW_OFFSET;
 
         // a16 a17 --- a23
-        row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
+        row0 = _mm_loadl_epi64(
+            (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]);
         // b16 b17 --- b23
-        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
+        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride -
+                                                 row_offset - filter_offset]);
 
         // a16-a23 x x x x| b16-b23 x x x x
         __m256i r1 =
@@ -605,9 +631,9 @@
 
         // Pad end pixels to the right, while processing the last pixels in the
         // row.
-        const int is_last_cols16 = wd_processed + 16 == filtered_length;
         if (is_last_cols16) {
-          r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
+          r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
+                                   wd32_end_pad_mask);
         }
 
         // a0 a1 --- a15 || b0 b1 --- b15
@@ -624,7 +650,7 @@
         res_out_0[0] = res_out_0[1] = zero;
         resize_convolve(s0, coeffs_x, res_out_0);
 
-        // r00 -r07
+        // r00-r07
         res_out_0[0] = _mm256_sra_epi32(
             _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
         // r10-r17
@@ -647,23 +673,30 @@
                          _mm_unpackhi_epi64(low_result, low_result));
       }
 
+      // To avoid pixel over-read at frame boundary, processing of 8 pixels
+      // is done only if sufficient number of pixels required for the
+      // load are present. The remaining pixels are processed by C function.
       wd_processed = filtered_length - remain_col;
-      if (remain_col > 7) {
-        remain_col = filtered_length % 8;
+      if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
+        remain_col = filtered_length - wd_processed - 8;
         const int in_idx = i * in_stride + wd_processed - filter_offset;
         const int out_idx = (i * dst_stride) + wd_processed / 2;
+        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
+        if (is_last_cols_8) row_offset = ROW_OFFSET;
         // a0 a1 --- a15
-        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]);
         // b0 b1 --- b15
-        __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+        __m128i row1 =
+            _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]);
         // a0 a1 --- a15 || b0 b1 --- b15
         __m256i r0 =
             _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
 
         // Pad end pixels to the right, while processing the last pixels in the
         // row.
-        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
-        if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
+        if (is_last_cols_8)
+          r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
+                                   wd8_end_pad_mask);
 
         // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
         s0[0] = r0;
@@ -673,6 +706,7 @@
         s0[2] = _mm256_bsrli_epi128(r0, 4);
         // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
         s0[3] = _mm256_bsrli_epi128(r0, 6);
+
         __m256i res_out_0[2];
         res_out_0[0] = res_out_0[1] = zero;
         resize_convolve(s0, coeffs_x, res_out_0);
@@ -696,10 +730,6 @@
       }
 
       wd_processed = filtered_length - remain_col;
-      // When the remaining width is 2, the above code would not have taken
-      // care of padding required for (filtered_length - 4)th pixel. Hence,
-      // process that pixel again with the C code.
-      wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
       if (remain_col) {
         const int in_idx = (in_stride * i);
         const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
index 6b34ceb..e2d84da 100644
--- a/av1/common/x86/resize_sse2.c
+++ b/av1/common/x86/resize_sse2.c
@@ -16,6 +16,8 @@
 
 #include "aom_dsp/x86/synonyms.h"
 
+#define ROW_OFFSET 5
+
 #define PROCESS_RESIZE_Y_WD8                                           \
   /* ah0 ah1 ... ah7 */                                                \
   const __m128i AH = _mm_add_epi16(l0, l7);                            \
@@ -200,7 +202,6 @@
   __m128i coeffs_x[2];
   const int bits = FILTER_BITS;
   const int dst_stride = width2;
-  const int remain_col = filtered_length % 16;
   const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
   const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
 
@@ -215,15 +216,27 @@
 
   for (int i = 0; i < height; ++i) {
     int filter_offset = 0;
+    int row01_offset = ROW_OFFSET;
+    int remain_col = filtered_length;
+    // To avoid pixel over-read at frame boundary, processing of 16 pixels
+    // is done using the core loop only if sufficient number of pixels required
+    // for the load are present.The remaining pixels are processed separately.
     for (int j = 0; j <= filtered_length - 16; j += 16) {
+      if (remain_col == 18 || remain_col == 20) {
+        break;
+      }
+      const int is_last_cols16 = (j == filtered_length - 16);
+      // While processing the last 16 pixels of the row, ensure that only valid
+      // pixels are loaded.
+      if (is_last_cols16) row01_offset = 0;
       const int in_idx = i * in_stride + j - filter_offset;
       const int out_idx = i * dst_stride + j / 2;
-
+      remain_col -= 16;
       // a0 a1 a2 a3 .... a15
       __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
       // a8 a9 a10 a11 .... a23
-      __m128i row01 =
-          _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
+      __m128i row01 = _mm_loadu_si128(
+          (__m128i *)&input[in_idx + row01_offset + filter_offset]);
       filter_offset = 3;
 
       // Pad start pixels to the left, while processing the first pixels in the
@@ -237,11 +250,11 @@
 
       // Pad end pixels to the right, while processing the last pixels in the
       // row.
-      const int is_last_cols16 = (j == filtered_length - 16);
       if (is_last_cols16) {
         const __m128i end_pixel_row0 =
             _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
-        row01 = blend(row01, end_pixel_row0, end_pad_mask);
+        row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0,
+                      end_pad_mask);
       }
 
       // a2 a3 a4 a5 a6 a7 a8 a9 .... a17
@@ -318,10 +331,6 @@
     }
 
     int wd_processed = filtered_length - remain_col;
-    // When the remaining width is 2, the above code would not have taken
-    // care of padding required for (filtered_length - 4)th pixel. Hence,
-    // process that pixel again with the C code.
-    wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
     if (remain_col) {
       const int in_idx = (in_stride * i);
       const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 83e56ed..9145803 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <memory>
+#include <new>
+
 #include "config/av1_rtcd.h"
 #include "test/acm_random.h"
 #include "test/util.h"
@@ -63,12 +66,18 @@
     height_ = std::get<1>(frame_dim_);
     const int msb = get_msb(AOMMIN(width_, height_));
     n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+    const int src_buf_size = (width_ / 2) * height_;
+    const int dest_buf_size = (width_ * height_) / 4;
+    src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]);
+    ASSERT_NE(src_, nullptr);
 
-    src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_));
     ref_dest_ =
-        (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_));
+        std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+    ASSERT_NE(ref_dest_, nullptr);
+
     test_dest_ =
-        (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_));
+        std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+    ASSERT_NE(test_dest_, nullptr);
   }
 
   void RunTest() {
@@ -76,11 +85,12 @@
     for (int level = 1; level < n_levels_; level++) {
       const int width2 = (width_ >> level);
       const int height2 = (height_ >> level);
-      av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
-                            width2, 0);
-      test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+      av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1,
+                            height2, width2, 0);
+      test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2,
+                width2, 0);
 
-      AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2);
+      AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height2);
     }
   }
 
@@ -92,8 +102,8 @@
       aom_usec_timer ref_timer;
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < kIters; j++) {
-        av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
-                              width2, 0);
+        av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1,
+                              height2, width2, 0);
       }
       aom_usec_timer_mark(&ref_timer);
       const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
@@ -101,7 +111,8 @@
       aom_usec_timer tst_timer;
       aom_usec_timer_start(&tst_timer);
       for (int j = 0; j < kIters; j++) {
-        test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+        test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2,
+                  width2, 0);
       }
       aom_usec_timer_mark(&tst_timer);
       const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
@@ -112,21 +123,15 @@
     }
   }
 
-  void TearDown() {
-    aom_free(src_);
-    aom_free(ref_dest_);
-    aom_free(test_dest_);
-  }
-
  private:
   LowBDResizeFunc test_fun_;
   FrameDimension frame_dim_;
   int width_;
   int height_;
   int n_levels_;
-  uint8_t *src_;
-  uint8_t *ref_dest_;
-  uint8_t *test_dest_;
+  std::unique_ptr<uint8_t[]> src_;
+  std::unique_ptr<uint8_t[]> ref_dest_;
+  std::unique_ptr<uint8_t[]> test_dest_;
   libaom_test::ACMRandom rng_;
 };
 
@@ -141,7 +146,9 @@
 const FrameDimension kFrameDim[] = {
   make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080),
   make_tuple(1280, 720),  make_tuple(640, 480),   make_tuple(640, 360),
-  make_tuple(256, 256),
+  make_tuple(286, 286),   make_tuple(284, 284),   make_tuple(282, 282),
+  make_tuple(280, 280),   make_tuple(262, 262),   make_tuple(258, 258),
+  make_tuple(256, 256),   make_tuple(34, 34),
 };
 #endif
 
@@ -174,11 +181,18 @@
     height_ = std::get<1>(frame_dim_);
     const int msb = get_msb(AOMMIN(width_, height_));
     n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
-    src_ = (uint8_t *)aom_malloc(width_ * height_ * sizeof(*src_));
+    const int src_buf_size = width_ * height_;
+    const int dest_buf_size = (width_ * height_) / 2;
+    src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]);
+    ASSERT_NE(src_, nullptr);
+
     ref_dest_ =
-        (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*ref_dest_));
+        std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+    ASSERT_NE(ref_dest_, nullptr);
+
     test_dest_ =
-        (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*test_dest_));
+        std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+    ASSERT_NE(test_dest_, nullptr);
   }
 
   void RunTest() {
@@ -186,10 +200,11 @@
 
     for (int level = 1; level < n_levels_; ++level) {
       const int width2 = (width_ >> level);
-      av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1,
-                            width2);
-      test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2);
-      AssertOutputBufferEq(ref_dest_, test_dest_, width2, height_);
+      av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_,
+                            width2 << 1, width2);
+      test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1,
+                width2);
+      AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height_);
     }
   }
 
@@ -201,8 +216,8 @@
       aom_usec_timer ref_timer;
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < kIters; ++j) {
-        av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1,
-                              width2);
+        av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_,
+                              width2 << 1, width2);
       }
       aom_usec_timer_mark(&ref_timer);
       const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
@@ -210,7 +225,8 @@
       aom_usec_timer tst_timer;
       aom_usec_timer_start(&tst_timer);
       for (int j = 0; j < kIters; ++j) {
-        test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2);
+        test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1,
+                  width2);
       }
       aom_usec_timer_mark(&tst_timer);
       const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
@@ -221,21 +237,15 @@
     }
   }
 
-  void TearDown() {
-    aom_free(src_);
-    aom_free(ref_dest_);
-    aom_free(test_dest_);
-  }
-
  private:
   LowBDResize_x_Func test_fun_;
   FrameDimension frame_dim_;
   int width_;
   int height_;
   int n_levels_;
-  uint8_t *src_;
-  uint8_t *ref_dest_;
-  uint8_t *test_dest_;
+  std::unique_ptr<uint8_t[]> src_;
+  std::unique_ptr<uint8_t[]> ref_dest_;
+  std::unique_ptr<uint8_t[]> test_dest_;
   libaom_test::ACMRandom rng_;
 };
 
@@ -245,9 +255,7 @@
 
 TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); }
 
-// TODO(https://crbug.com/aomedia/3575): Reenable this after test passes under
-// 32-bit valgrind.
-#if 0  // HAVE_SSE2
+#if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1ResizeXTest,
     ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),