Fix memory over-read issue in av1_resize_horz_dir() SIMD This CL fixes the test failures under 32-bit valgrind due to memory over-read issue reported in Bug: aomedia:3575. To fix this issue pixel overloading at frame boundary is avoided. Also av1_resize_horz_dir_sse2() is enabled. Bug: aomedia:3575 Change-Id: I50d87adb033c7e2cab036d66d49c11b5b81469ca
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 7c4f539..7fa8b4e 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -558,9 +558,7 @@ specialize qw/av1_resize_vert_dir sse2 avx2/; add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2"; -# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest -# passes under 32-bit valgrind. -specialize qw/av1_resize_horz_dir avx2/; +specialize qw/av1_resize_horz_dir sse2 avx2/; add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c index 425c9f4..9c8958e 100644 --- a/av1/common/x86/resize_avx2.c +++ b/av1/common/x86/resize_avx2.c
@@ -17,6 +17,7 @@ #include "aom_dsp/x86/synonyms.h" +#define ROW_OFFSET 5 #define CAST_HI(x) _mm256_castsi128_si256(x) #define CAST_LOW(x) _mm256_castsi256_si128(x) @@ -122,7 +123,7 @@ filter_offset = 3; \ \ /* Pad start pixels to the left, while processing the first pixels in the \ - row. */ \ + * row. */ \ if (j == 0) { \ /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \ row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \ @@ -131,21 +132,24 @@ r0 = row0; \ r1 = row1; \ } \ - \ + const int is_last_cols32 = (j + 32 == filtered_length); \ + /* Avoid loading extra pixels at frame boundary.*/ \ + if (is_last_cols32) row_offset = ROW_OFFSET; \ /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \ __m128i row0_0 = _mm_loadl_epi64( \ - (__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \ + (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \ /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \ - __m128i row1_0 = _mm_loadl_epi64( \ - (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \ + __m128i row1_0 = \ + _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \ + filter_offset - row_offset]); \ __m256i r2 = _mm256_permute2x128_si256( \ _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \ \ /* Pad end pixels to the right, while processing the last pixels in the \ - row. */ \ - const int is_last_cols32 = (j + 32 == filtered_length); \ + * row. */ \ if (is_last_cols32) { \ - r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \ + r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \ + wd32_end_pad_mask); \ } \ \ /* Process even pixels of the first row */ \ @@ -169,7 +173,8 @@ s1[3] = _mm256_alignr_epi8(r2, r1, 6); \ \ /* The register res_out_0 stores the result of start-16 pixels corresponding \ -to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \ + * to the first and second rows whereas res_out_1 stores the end-16 \ + * pixels. */ \ __m256i res_out_0[2], res_out_1[2]; \ res_out_1[0] = res_out_1[1] = zero; \ res_out_0[0] = res_out_0[1] = zero; \ @@ -184,7 +189,7 @@ /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \ __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \ \ - /* result of 32 pixels of row1 (b0 to b32) */ \ + /* Result of 32 pixels of row1 (b0 to b32) */ \ res_out_0[1] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \ res_out_1[1] = _mm256_sra_epi32( \ @@ -530,12 +535,10 @@ uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); - // Invoke C for width less than 32. - // TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest - // passes under 32-bit valgrind. + // Invoke SSE2 for width less than 32. if (filtered_length < 32) { - av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, - width2); + av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, + width2); return; } @@ -569,6 +572,7 @@ if (filtered_length % 32 == 0) { for (int i = 0; i < height; i += 2) { int filter_offset = 0; + int row_offset = 0; for (int j = 0; j < filtered_length; j += 32) { PROCESS_RESIZE_X_WD32 } @@ -576,28 +580,50 @@ } else { for (int i = 0; i < height; i += 2) { int filter_offset = 0; - int remain_col = filtered_length % 32; - for (int j = 0; j + 32 <= filtered_length; j += 32) { + int remain_col = filtered_length; + int row_offset = 0; + // To avoid pixel over-read at frame boundary, processing of 32 pixels + // is done using the core loop only if sufficient number of pixels + // required for the load are present. The remaining pixels are processed + // separately. + for (int j = 0; j <= filtered_length - 32; j += 32) { + if (remain_col == 34 || remain_col == 36) { + break; + } PROCESS_RESIZE_X_WD32 + remain_col -= 32; } int wd_processed = filtered_length - remain_col; - if (remain_col > 15) { - remain_col = filtered_length % 16; - const int in_idx = i * in_stride + wd_processed - filter_offset; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed separately. + if (remain_col > 15 && remain_col != 18 && remain_col != 20) { + remain_col = filtered_length - wd_processed - 16; + const int in_idx = i * in_stride + wd_processed; const int out_idx = (i * dst_stride) + wd_processed / 2; // a0 a1 --- a15 - __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + __m128i row0 = + _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]); // b0 b1 --- b15 - __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + __m128i row1 = _mm_loadu_si128( + (__m128i *)&input[in_idx + in_stride - filter_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); + if (filter_offset == 0) { + r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); + } + filter_offset = 3; + const int is_last_cols16 = wd_processed + 16 == filtered_length; + if (is_last_cols16) row_offset = ROW_OFFSET; // a16 a17 --- a23 - row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]); + row0 = _mm_loadl_epi64( + (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]); // b16 b17 --- b23 - row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]); + row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride - + row_offset - filter_offset]); // a16-a23 x x x x| b16-b23 x x x x __m256i r1 = @@ -605,9 +631,9 @@ // Pad end pixels to the right, while processing the last pixels in the // row. - const int is_last_cols16 = wd_processed + 16 == filtered_length; if (is_last_cols16) { - r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask); + r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET), + wd32_end_pad_mask); } // a0 a1 --- a15 || b0 b1 --- b15 @@ -624,7 +650,7 @@ res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); - // r00 -r07 + // r00-r07 res_out_0[0] = _mm256_sra_epi32( _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); // r10-r17 @@ -647,23 +673,30 @@ _mm_unpackhi_epi64(low_result, low_result)); } + // To avoid pixel over-read at frame boundary, processing of 8 pixels + // is done only if sufficient number of pixels required for the + // load are present. The remaining pixels are processed by C function. wd_processed = filtered_length - remain_col; - if (remain_col > 7) { - remain_col = filtered_length % 8; + if (remain_col > 7 && remain_col != 10 && remain_col != 12) { + remain_col = filtered_length - wd_processed - 8; const int in_idx = i * in_stride + wd_processed - filter_offset; const int out_idx = (i * dst_stride) + wd_processed / 2; + const int is_last_cols_8 = wd_processed + 8 == filtered_length; + if (is_last_cols_8) row_offset = ROW_OFFSET; // a0 a1 --- a15 - __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]); + __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]); // b0 b1 --- b15 - __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]); + __m128i row1 = + _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); // Pad end pixels to the right, while processing the last pixels in the // row. - const int is_last_cols_8 = wd_processed + 8 == filtered_length; - if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask); + if (is_last_cols_8) + r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET), + wd8_end_pad_mask); // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7 s0[0] = r0; @@ -673,6 +706,7 @@ s0[2] = _mm256_bsrli_epi128(r0, 4); // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10 s0[3] = _mm256_bsrli_epi128(r0, 6); + __m256i res_out_0[2]; res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); @@ -696,10 +730,6 @@ } wd_processed = filtered_length - remain_col; - // When the remaining width is 2, the above code would not have taken - // care of padding required for (filtered_length - 4)th pixel. Hence, - // process that pixel again with the C code. - wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c index 6b34ceb..e2d84da 100644 --- a/av1/common/x86/resize_sse2.c +++ b/av1/common/x86/resize_sse2.c
@@ -16,6 +16,8 @@ #include "aom_dsp/x86/synonyms.h" +#define ROW_OFFSET 5 + #define PROCESS_RESIZE_Y_WD8 \ /* ah0 ah1 ... ah7 */ \ const __m128i AH = _mm_add_epi16(l0, l7); \ @@ -200,7 +202,6 @@ __m128i coeffs_x[2]; const int bits = FILTER_BITS; const int dst_stride = width2; - const int remain_col = filtered_length % 16; const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); @@ -215,15 +216,27 @@ for (int i = 0; i < height; ++i) { int filter_offset = 0; + int row01_offset = ROW_OFFSET; + int remain_col = filtered_length; + // To avoid pixel over-read at frame boundary, processing of 16 pixels + // is done using the core loop only if sufficient number of pixels required + // for the load are present.The remaining pixels are processed separately. for (int j = 0; j <= filtered_length - 16; j += 16) { + if (remain_col == 18 || remain_col == 20) { + break; + } + const int is_last_cols16 = (j == filtered_length - 16); + // While processing the last 16 pixels of the row, ensure that only valid + // pixels are loaded. + if (is_last_cols16) row01_offset = 0; const int in_idx = i * in_stride + j - filter_offset; const int out_idx = i * dst_stride + j / 2; - + remain_col -= 16; // a0 a1 a2 a3 .... a15 __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); // a8 a9 a10 a11 .... a23 - __m128i row01 = - _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]); + __m128i row01 = _mm_loadu_si128( + (__m128i *)&input[in_idx + row01_offset + filter_offset]); filter_offset = 3; // Pad start pixels to the left, while processing the first pixels in the @@ -237,11 +250,11 @@ // Pad end pixels to the right, while processing the last pixels in the // row. - const int is_last_cols16 = (j == filtered_length - 16); if (is_last_cols16) { const __m128i end_pixel_row0 = _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]); - row01 = blend(row01, end_pixel_row0, end_pad_mask); + row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0, + end_pad_mask); } // a2 a3 a4 a5 a6 a7 a8 a9 .... a17 @@ -318,10 +331,6 @@ } int wd_processed = filtered_length - remain_col; - // When the remaining width is 2, the above code would not have taken - // care of padding required for (filtered_length - 4)th pixel. Hence, - // process that pixel again with the C code. - wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc index 83e56ed..9145803 100644 --- a/test/frame_resize_test.cc +++ b/test/frame_resize_test.cc
@@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <memory> +#include <new> + #include "config/av1_rtcd.h" #include "test/acm_random.h" #include "test/util.h" @@ -63,12 +66,18 @@ height_ = std::get<1>(frame_dim_); const int msb = get_msb(AOMMIN(width_, height_)); n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + const int src_buf_size = (width_ / 2) * height_; + const int dest_buf_size = (width_ * height_) / 4; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); - src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_)); ref_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + test_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); } void RunTest() { @@ -76,11 +85,12 @@ for (int level = 1; level < n_levels_; level++) { const int width2 = (width_ >> level); const int height2 = (height_ >> level); - av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, - width2, 0); - test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); - AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height2); } } @@ -92,8 +102,8 @@ aom_usec_timer ref_timer; aom_usec_timer_start(&ref_timer); for (int j = 0; j < kIters; j++) { - av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, - width2, 0); + av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1, + height2, width2, 0); } aom_usec_timer_mark(&ref_timer); const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); @@ -101,7 +111,8 @@ aom_usec_timer tst_timer; aom_usec_timer_start(&tst_timer); for (int j = 0; j < kIters; j++) { - test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2, + width2, 0); } aom_usec_timer_mark(&tst_timer); const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); @@ -112,21 +123,15 @@ } } - void TearDown() { - aom_free(src_); - aom_free(ref_dest_); - aom_free(test_dest_); - } - private: LowBDResizeFunc test_fun_; FrameDimension frame_dim_; int width_; int height_; int n_levels_; - uint8_t *src_; - uint8_t *ref_dest_; - uint8_t *test_dest_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; libaom_test::ACMRandom rng_; }; @@ -141,7 +146,9 @@ const FrameDimension kFrameDim[] = { make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), - make_tuple(256, 256), + make_tuple(286, 286), make_tuple(284, 284), make_tuple(282, 282), + make_tuple(280, 280), make_tuple(262, 262), make_tuple(258, 258), + make_tuple(256, 256), make_tuple(34, 34), }; #endif @@ -174,11 +181,18 @@ height_ = std::get<1>(frame_dim_); const int msb = get_msb(AOMMIN(width_, height_)); n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - src_ = (uint8_t *)aom_malloc(width_ * height_ * sizeof(*src_)); + const int src_buf_size = width_ * height_; + const int dest_buf_size = (width_ * height_) / 2; + src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]); + ASSERT_NE(src_, nullptr); + ref_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*ref_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(ref_dest_, nullptr); + test_dest_ = - (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*test_dest_)); + std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]); + ASSERT_NE(test_dest_, nullptr); } void RunTest() { @@ -186,10 +200,11 @@ for (int level = 1; level < n_levels_; ++level) { const int width2 = (width_ >> level); - av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, - width2); - test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); - AssertOutputBufferEq(ref_dest_, test_dest_, width2, height_); + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_, + width2 << 1, width2); + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); + AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height_); } } @@ -201,8 +216,8 @@ aom_usec_timer ref_timer; aom_usec_timer_start(&ref_timer); for (int j = 0; j < kIters; ++j) { - av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1, - width2); + av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_, + width2 << 1, width2); } aom_usec_timer_mark(&ref_timer); const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); @@ -210,7 +225,8 @@ aom_usec_timer tst_timer; aom_usec_timer_start(&tst_timer); for (int j = 0; j < kIters; ++j) { - test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2); + test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1, + width2); } aom_usec_timer_mark(&tst_timer); const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); @@ -221,21 +237,15 @@ } } - void TearDown() { - aom_free(src_); - aom_free(ref_dest_); - aom_free(test_dest_); - } - private: LowBDResize_x_Func test_fun_; FrameDimension frame_dim_; int width_; int height_; int n_levels_; - uint8_t *src_; - uint8_t *ref_dest_; - uint8_t *test_dest_; + std::unique_ptr<uint8_t[]> src_; + std::unique_ptr<uint8_t[]> ref_dest_; + std::unique_ptr<uint8_t[]> test_dest_; libaom_test::ACMRandom rng_; }; @@ -245,9 +255,7 @@ TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); } -// TODO(https://crbug.com/aomedia/3575): Reenable this after test passes under -// 32-bit valgrind. -#if 0 // HAVE_SSE2 +#if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( SSE2, AV1ResizeXTest, ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),