Fix memory over-read issue in av1_resize_horz_dir() SIMD
This CL fixes the test failures under 32-bit valgrind due
to memory over-read issue reported in Bug: aomedia:3575.
To fix this issue pixel overloading at frame boundary is
avoided. Also av1_resize_horz_dir_sse2() is enabled.
Bug: aomedia:3575
Change-Id: I50d87adb033c7e2cab036d66d49c11b5b81469ca
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7c4f539..7fa8b4e 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -558,9 +558,7 @@
specialize qw/av1_resize_vert_dir sse2 avx2/;
add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
-# TODO(https://crbug.com/aomedia/3575): Restore sse2 after SSE2/AV1ResizeXTest
-# passes under 32-bit valgrind.
-specialize qw/av1_resize_horz_dir avx2/;
+specialize qw/av1_resize_horz_dir sse2 avx2/;
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c
index 425c9f4..9c8958e 100644
--- a/av1/common/x86/resize_avx2.c
+++ b/av1/common/x86/resize_avx2.c
@@ -17,6 +17,7 @@
#include "aom_dsp/x86/synonyms.h"
+#define ROW_OFFSET 5
#define CAST_HI(x) _mm256_castsi128_si256(x)
#define CAST_LOW(x) _mm256_castsi256_si128(x)
@@ -122,7 +123,7 @@
filter_offset = 3; \
\
/* Pad start pixels to the left, while processing the first pixels in the \
- row. */ \
+ * row. */ \
if (j == 0) { \
/* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \
row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \
@@ -131,21 +132,24 @@
r0 = row0; \
r1 = row1; \
} \
- \
+ const int is_last_cols32 = (j + 32 == filtered_length); \
+ /* Avoid loading extra pixels at frame boundary.*/ \
+ if (is_last_cols32) row_offset = ROW_OFFSET; \
/* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \
__m128i row0_0 = _mm_loadl_epi64( \
- (__m128i *)&input[i * in_stride + 32 + j - filter_offset]); \
+ (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \
/* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \
- __m128i row1_0 = _mm_loadl_epi64( \
- (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]); \
+ __m128i row1_0 = \
+ _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \
+ filter_offset - row_offset]); \
__m256i r2 = _mm256_permute2x128_si256( \
_mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \
\
/* Pad end pixels to the right, while processing the last pixels in the \
- row. */ \
- const int is_last_cols32 = (j + 32 == filtered_length); \
+ * row. */ \
if (is_last_cols32) { \
- r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask); \
+ r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \
+ wd32_end_pad_mask); \
} \
\
/* Process even pixels of the first row */ \
@@ -169,7 +173,8 @@
s1[3] = _mm256_alignr_epi8(r2, r1, 6); \
\
/* The register res_out_0 stores the result of start-16 pixels corresponding \
-to the first and second rows whereas res_out_1 stores the end-16 pixels. */ \
+ * to the first and second rows whereas res_out_1 stores the end-16 \
+ * pixels. */ \
__m256i res_out_0[2], res_out_1[2]; \
res_out_1[0] = res_out_1[1] = zero; \
res_out_0[0] = res_out_0[1] = zero; \
@@ -184,7 +189,7 @@
/* r00-r03 r08-r011 | r04-r07 r012-r015 */ \
__m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \
\
- /* result of 32 pixels of row1 (b0 to b32) */ \
+ /* Result of 32 pixels of row1 (b0 to b32) */ \
res_out_0[1] = _mm256_sra_epi32( \
_mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \
res_out_1[1] = _mm256_sra_epi32( \
@@ -530,12 +535,10 @@
uint8_t *intbuf, int height, int filtered_length,
int width2) {
assert(height % 2 == 0);
- // Invoke C for width less than 32.
- // TODO(https://crbug.com/aomedia/3575): Use sse2 after SSE2/AV1ResizeXTest
- // passes under 32-bit valgrind.
+ // Invoke SSE2 for width less than 32.
if (filtered_length < 32) {
- av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
- width2);
+ av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
+ width2);
return;
}
@@ -569,6 +572,7 @@
if (filtered_length % 32 == 0) {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
+ int row_offset = 0;
for (int j = 0; j < filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
@@ -576,28 +580,50 @@
} else {
for (int i = 0; i < height; i += 2) {
int filter_offset = 0;
- int remain_col = filtered_length % 32;
- for (int j = 0; j + 32 <= filtered_length; j += 32) {
+ int remain_col = filtered_length;
+ int row_offset = 0;
+ // To avoid pixel over-read at frame boundary, processing of 32 pixels
+ // is done using the core loop only if sufficient number of pixels
+ // required for the load are present. The remaining pixels are processed
+ // separately.
+ for (int j = 0; j <= filtered_length - 32; j += 32) {
+ if (remain_col == 34 || remain_col == 36) {
+ break;
+ }
PROCESS_RESIZE_X_WD32
+ remain_col -= 32;
}
int wd_processed = filtered_length - remain_col;
- if (remain_col > 15) {
- remain_col = filtered_length % 16;
- const int in_idx = i * in_stride + wd_processed - filter_offset;
+ // To avoid pixel over-read at frame boundary, processing of 16 pixels
+ // is done only if sufficient number of pixels required for the
+ // load are present. The remaining pixels are processed separately.
+ if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
+ remain_col = filtered_length - wd_processed - 16;
+ const int in_idx = i * in_stride + wd_processed;
const int out_idx = (i * dst_stride) + wd_processed / 2;
// a0 a1 --- a15
- __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+ __m128i row0 =
+ _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]);
// b0 b1 --- b15
- __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+ __m128i row1 = _mm_loadu_si128(
+ (__m128i *)&input[in_idx + in_stride - filter_offset]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
+ if (filter_offset == 0) {
+ r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
+ }
+ filter_offset = 3;
+ const int is_last_cols16 = wd_processed + 16 == filtered_length;
+ if (is_last_cols16) row_offset = ROW_OFFSET;
// a16 a17 --- a23
- row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
+ row0 = _mm_loadl_epi64(
+ (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]);
// b16 b17 --- b23
- row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
+ row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride -
+ row_offset - filter_offset]);
// a16-a23 x x x x| b16-b23 x x x x
__m256i r1 =
@@ -605,9 +631,9 @@
// Pad end pixels to the right, while processing the last pixels in the
// row.
- const int is_last_cols16 = wd_processed + 16 == filtered_length;
if (is_last_cols16) {
- r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
+ r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
+ wd32_end_pad_mask);
}
// a0 a1 --- a15 || b0 b1 --- b15
@@ -624,7 +650,7 @@
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
- // r00 -r07
+ // r00-r07
res_out_0[0] = _mm256_sra_epi32(
_mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
// r10-r17
@@ -647,23 +673,30 @@
_mm_unpackhi_epi64(low_result, low_result));
}
+ // To avoid pixel over-read at frame boundary, processing of 8 pixels
+ // is done only if sufficient number of pixels required for the
+ // load are present. The remaining pixels are processed by C function.
wd_processed = filtered_length - remain_col;
- if (remain_col > 7) {
- remain_col = filtered_length % 8;
+ if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
+ remain_col = filtered_length - wd_processed - 8;
const int in_idx = i * in_stride + wd_processed - filter_offset;
const int out_idx = (i * dst_stride) + wd_processed / 2;
+ const int is_last_cols_8 = wd_processed + 8 == filtered_length;
+ if (is_last_cols_8) row_offset = ROW_OFFSET;
// a0 a1 --- a15
- __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+ __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]);
// b0 b1 --- b15
- __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+ __m128i row1 =
+ _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]);
// a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the
// row.
- const int is_last_cols_8 = wd_processed + 8 == filtered_length;
- if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
+ if (is_last_cols_8)
+ r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
+ wd8_end_pad_mask);
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
s0[0] = r0;
@@ -673,6 +706,7 @@
s0[2] = _mm256_bsrli_epi128(r0, 4);
// a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
s0[3] = _mm256_bsrli_epi128(r0, 6);
+
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
@@ -696,10 +730,6 @@
}
wd_processed = filtered_length - remain_col;
- // When the remaining width is 2, the above code would not have taken
- // care of padding required for (filtered_length - 4)th pixel. Hence,
- // process that pixel again with the C code.
- wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
index 6b34ceb..e2d84da 100644
--- a/av1/common/x86/resize_sse2.c
+++ b/av1/common/x86/resize_sse2.c
@@ -16,6 +16,8 @@
#include "aom_dsp/x86/synonyms.h"
+#define ROW_OFFSET 5
+
#define PROCESS_RESIZE_Y_WD8 \
/* ah0 ah1 ... ah7 */ \
const __m128i AH = _mm_add_epi16(l0, l7); \
@@ -200,7 +202,6 @@
__m128i coeffs_x[2];
const int bits = FILTER_BITS;
const int dst_stride = width2;
- const int remain_col = filtered_length % 16;
const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
@@ -215,15 +216,27 @@
for (int i = 0; i < height; ++i) {
int filter_offset = 0;
+ int row01_offset = ROW_OFFSET;
+ int remain_col = filtered_length;
+ // To avoid pixel over-read at frame boundary, processing of 16 pixels
+ // is done using the core loop only if sufficient number of pixels required
+ // for the load are present.The remaining pixels are processed separately.
for (int j = 0; j <= filtered_length - 16; j += 16) {
+ if (remain_col == 18 || remain_col == 20) {
+ break;
+ }
+ const int is_last_cols16 = (j == filtered_length - 16);
+ // While processing the last 16 pixels of the row, ensure that only valid
+ // pixels are loaded.
+ if (is_last_cols16) row01_offset = 0;
const int in_idx = i * in_stride + j - filter_offset;
const int out_idx = i * dst_stride + j / 2;
-
+ remain_col -= 16;
// a0 a1 a2 a3 .... a15
__m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
// a8 a9 a10 a11 .... a23
- __m128i row01 =
- _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
+ __m128i row01 = _mm_loadu_si128(
+ (__m128i *)&input[in_idx + row01_offset + filter_offset]);
filter_offset = 3;
// Pad start pixels to the left, while processing the first pixels in the
@@ -237,11 +250,11 @@
// Pad end pixels to the right, while processing the last pixels in the
// row.
- const int is_last_cols16 = (j == filtered_length - 16);
if (is_last_cols16) {
const __m128i end_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
- row01 = blend(row01, end_pixel_row0, end_pad_mask);
+ row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0,
+ end_pad_mask);
}
// a2 a3 a4 a5 a6 a7 a8 a9 .... a17
@@ -318,10 +331,6 @@
}
int wd_processed = filtered_length - remain_col;
- // When the remaining width is 2, the above code would not have taken
- // care of padding required for (filtered_length - 4)th pixel. Hence,
- // process that pixel again with the C code.
- wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
if (remain_col) {
const int in_idx = (in_stride * i);
const int out_idx = (wd_processed / 2) + width2 * i;
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 83e56ed..9145803 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <memory>
+#include <new>
+
#include "config/av1_rtcd.h"
#include "test/acm_random.h"
#include "test/util.h"
@@ -63,12 +66,18 @@
height_ = std::get<1>(frame_dim_);
const int msb = get_msb(AOMMIN(width_, height_));
n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+ const int src_buf_size = (width_ / 2) * height_;
+ const int dest_buf_size = (width_ * height_) / 4;
+ src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]);
+ ASSERT_NE(src_, nullptr);
- src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_));
ref_dest_ =
- (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_));
+ std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+ ASSERT_NE(ref_dest_, nullptr);
+
test_dest_ =
- (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_));
+ std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+ ASSERT_NE(test_dest_, nullptr);
}
void RunTest() {
@@ -76,11 +85,12 @@
for (int level = 1; level < n_levels_; level++) {
const int width2 = (width_ >> level);
const int height2 = (height_ >> level);
- av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
- width2, 0);
- test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+ av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1,
+ height2, width2, 0);
+ test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2,
+ width2, 0);
- AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2);
+ AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height2);
}
}
@@ -92,8 +102,8 @@
aom_usec_timer ref_timer;
aom_usec_timer_start(&ref_timer);
for (int j = 0; j < kIters; j++) {
- av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
- width2, 0);
+ av1_resize_vert_dir_c(src_.get(), ref_dest_.get(), width2, height2 << 1,
+ height2, width2, 0);
}
aom_usec_timer_mark(&ref_timer);
const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
@@ -101,7 +111,8 @@
aom_usec_timer tst_timer;
aom_usec_timer_start(&tst_timer);
for (int j = 0; j < kIters; j++) {
- test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+ test_fun_(src_.get(), test_dest_.get(), width2, height2 << 1, height2,
+ width2, 0);
}
aom_usec_timer_mark(&tst_timer);
const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
@@ -112,21 +123,15 @@
}
}
- void TearDown() {
- aom_free(src_);
- aom_free(ref_dest_);
- aom_free(test_dest_);
- }
-
private:
LowBDResizeFunc test_fun_;
FrameDimension frame_dim_;
int width_;
int height_;
int n_levels_;
- uint8_t *src_;
- uint8_t *ref_dest_;
- uint8_t *test_dest_;
+ std::unique_ptr<uint8_t[]> src_;
+ std::unique_ptr<uint8_t[]> ref_dest_;
+ std::unique_ptr<uint8_t[]> test_dest_;
libaom_test::ACMRandom rng_;
};
@@ -141,7 +146,9 @@
const FrameDimension kFrameDim[] = {
make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080),
make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360),
- make_tuple(256, 256),
+ make_tuple(286, 286), make_tuple(284, 284), make_tuple(282, 282),
+ make_tuple(280, 280), make_tuple(262, 262), make_tuple(258, 258),
+ make_tuple(256, 256), make_tuple(34, 34),
};
#endif
@@ -174,11 +181,18 @@
height_ = std::get<1>(frame_dim_);
const int msb = get_msb(AOMMIN(width_, height_));
n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
- src_ = (uint8_t *)aom_malloc(width_ * height_ * sizeof(*src_));
+ const int src_buf_size = width_ * height_;
+ const int dest_buf_size = (width_ * height_) / 2;
+ src_ = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[src_buf_size]);
+ ASSERT_NE(src_, nullptr);
+
ref_dest_ =
- (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*ref_dest_));
+ std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+ ASSERT_NE(ref_dest_, nullptr);
+
test_dest_ =
- (uint8_t *)aom_calloc((width_ * height_) / 2, sizeof(*test_dest_));
+ std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[dest_buf_size]);
+ ASSERT_NE(test_dest_, nullptr);
}
void RunTest() {
@@ -186,10 +200,11 @@
for (int level = 1; level < n_levels_; ++level) {
const int width2 = (width_ >> level);
- av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1,
- width2);
- test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2);
- AssertOutputBufferEq(ref_dest_, test_dest_, width2, height_);
+ av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_,
+ width2 << 1, width2);
+ test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1,
+ width2);
+ AssertOutputBufferEq(ref_dest_.get(), test_dest_.get(), width2, height_);
}
}
@@ -201,8 +216,8 @@
aom_usec_timer ref_timer;
aom_usec_timer_start(&ref_timer);
for (int j = 0; j < kIters; ++j) {
- av1_resize_horz_dir_c(src_, width_, ref_dest_, height_, width2 << 1,
- width2);
+ av1_resize_horz_dir_c(src_.get(), width_, ref_dest_.get(), height_,
+ width2 << 1, width2);
}
aom_usec_timer_mark(&ref_timer);
const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
@@ -210,7 +225,8 @@
aom_usec_timer tst_timer;
aom_usec_timer_start(&tst_timer);
for (int j = 0; j < kIters; ++j) {
- test_fun_(src_, width_, test_dest_, height_, width2 << 1, width2);
+ test_fun_(src_.get(), width_, test_dest_.get(), height_, width2 << 1,
+ width2);
}
aom_usec_timer_mark(&tst_timer);
const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
@@ -221,21 +237,15 @@
}
}
- void TearDown() {
- aom_free(src_);
- aom_free(ref_dest_);
- aom_free(test_dest_);
- }
-
private:
LowBDResize_x_Func test_fun_;
FrameDimension frame_dim_;
int width_;
int height_;
int n_levels_;
- uint8_t *src_;
- uint8_t *ref_dest_;
- uint8_t *test_dest_;
+ std::unique_ptr<uint8_t[]> src_;
+ std::unique_ptr<uint8_t[]> ref_dest_;
+ std::unique_ptr<uint8_t[]> test_dest_;
libaom_test::ACMRandom rng_;
};
@@ -245,9 +255,7 @@
TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); }
-// TODO(https://crbug.com/aomedia/3575): Reenable this after test passes under
-// 32-bit valgrind.
-#if 0 // HAVE_SSE2
+#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, AV1ResizeXTest,
::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),