Add SSE2 and AVX2 variant for aom_var_2d
Module gains for 128x128 and 256x256
block sizes (w.r.t. C)
8bit hbd
AVX2 11x 5x
SSE2 7x 3x
Change-Id: I51ee4de74dd310e2dfd88a95e12096b72d5ca6eb
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6006367..f12d838 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -614,9 +614,11 @@
add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
specialize qw/aom_sum_squares_i16 sse2/;
- add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+ add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u8 sse2 avx2/;
add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u16 sse2 avx2/;
}
#
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index 370bd75..c66dd52 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -178,22 +178,22 @@
uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
int vstart, int height) {
- return aom_var_2d_u8_c(a->y_buffer + vstart * a->y_stride + hstart,
- a->y_stride, width, height) /
+ return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ width, height) /
(width * height);
}
uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
int vstart, int height) {
- return aom_var_2d_u8_c(a->u_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride, width, height) /
+ return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
(width * height);
}
uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
int vstart, int height) {
- return aom_var_2d_u8_c(a->v_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride, width, height) /
+ return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
(width * height);
}
@@ -251,22 +251,22 @@
#if CONFIG_AV1_HIGHBITDEPTH
uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
int width, int vstart, int height) {
- return aom_var_2d_u16_c(a->y_buffer + vstart * a->y_stride + hstart,
- a->y_stride, width, height) /
+ return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart,
+ a->y_stride, width, height) /
(width * height);
}
uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
int width, int vstart, int height) {
- return aom_var_2d_u16_c(a->u_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride, width, height) /
+ return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
(width * height);
}
uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
int width, int vstart, int height) {
- return aom_var_2d_u16_c(a->v_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride, width, height) /
+ return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
(width * height);
}
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c
index 0af44e3..97d78b6 100644
--- a/aom_dsp/x86/sum_squares_avx2.c
+++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -77,3 +77,172 @@
return aom_sum_squares_2d_i16_c(src, stride, width, height);
}
}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 2);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ return _mm_extract_epi16(vtmp1, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ return _mm_cvtsi128_si32(vtmp1);
+}
+
// Computes SS - S^2/(w*h) (i.e. width*height*variance, before the caller's
// division) over an 8-bit 2D block. Columns are consumed 32 at a time with
// AVX2; any leftover right-edge columns (width % 32) go through the scalar
// tail loop at the bottom.
uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
                            int height) {
  uint64_t sum = 0, sum_sq = 0;
  const __m256i zero = _mm256_setzero_si256();
  __m256i acc_sum = zero;
  __m256i acc_sqs = zero;
  uint8_t *row_ptr;
  int i, j;

  for (i = 0; i < width - 31; i += 32) {
    row_ptr = src + i;
    // Blocks of 8 rows; the accumulators are flushed once per block so the
    // 16-bit partial sums cannot overflow.
    for (j = 0; j < height - 7; j += 8) {
      __m256i rows[8];
      int k;
      for (k = 0; k < 8; k++) {
        rows[k] = _mm256_loadu_si256((__m256i *)row_ptr);
        row_ptr += src_stride;
      }
      for (k = 0; k < 8; k++) {
        // Widen bytes to 16-bit lanes, then accumulate sums (epi16) and
        // squared sums (madd -> epi32).
        const __m256i lo = _mm256_unpacklo_epi8(rows[k], zero);
        const __m256i hi = _mm256_unpackhi_epi8(rows[k], zero);
        acc_sum = _mm256_add_epi16(acc_sum, lo);
        acc_sum = _mm256_add_epi16(acc_sum, hi);
        acc_sqs = _mm256_add_epi32(acc_sqs, _mm256_madd_epi16(lo, lo));
        acc_sqs = _mm256_add_epi32(acc_sqs, _mm256_madd_epi16(hi, hi));
      }

      // Flush the vector accumulators into the 64-bit scalar totals.
      sum += mm256_accumulate_epi16(acc_sum);
      sum_sq += mm256_accumulate_epi32(acc_sqs);
      acc_sum = zero;
      acc_sqs = zero;
    }

    // Leftover rows (height % 8), one at a time.
    for (; j < height; j++) {
      const __m256i row = _mm256_loadu_si256((__m256i *)row_ptr);
      const __m256i lo = _mm256_unpacklo_epi8(row, zero);
      const __m256i hi = _mm256_unpackhi_epi8(row, zero);
      acc_sum = _mm256_add_epi16(acc_sum, lo);
      acc_sum = _mm256_add_epi16(acc_sum, hi);
      acc_sqs = _mm256_add_epi32(acc_sqs, _mm256_madd_epi16(lo, lo));
      acc_sqs = _mm256_add_epi32(acc_sqs, _mm256_madd_epi16(hi, hi));
      row_ptr += src_stride;
    }

    // Flush once more for the leftover rows.
    sum += mm256_accumulate_epi16(acc_sum);
    sum_sq += mm256_accumulate_epi32(acc_sqs);
    acc_sum = zero;
    acc_sqs = zero;
  }

  // Scalar pass over the right-edge columns not covered above.
  row_ptr = src;
  for (int r = 0; r < height; r++) {
    for (int c = i; c < width; c++) {
      const uint8_t val = row_ptr[c];
      sum += val;
      sum_sq += val * val;
    }
    row_ptr += src_stride;
  }
  return (sum_sq - sum * sum / (width * height));
}
+
// High-bitdepth counterpart of aom_var_2d_u8_avx2: |src| is a
// CONVERT_TO_BYTEPTR()-style pointer to uint16_t samples. Computes
// SS - S^2/(w*h) (i.e. width*height*variance before the caller's division).
// Columns are consumed 16 at a time; leftover right-edge columns go through
// the scalar tail loop.
// NOTE(review): the squared terms use _mm256_madd_epi16, which treats the
// 16-bit lanes as signed -- correct for samples up to 15 bits (the intended
// <=12-bit high-bitdepth range), not for full-range uint16_t input.
uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
                             int height) {
  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
  uint64_t s = 0, ss = 0;
  __m256i vzero = _mm256_setzero_si256();
  __m256i v_acc_sum = vzero;
  __m256i v_acc_sqs = vzero;
  int i, j;

  // Process 16 elements in a row
  for (i = 0; i < width - 15; i += 16) {
    srcp = srcp1 + i;
    // Process 8 rows at a time. Bound fixed from "height - 8": that
    // off-by-one sent the last 8 rows of a multiple-of-8 height through the
    // slower single-row path below (the u8 variants already use "- 7").
    for (j = 0; j < height - 7; j += 8) {
      __m256i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Widen to 32-bit lanes for the sum; squares come straight from madd.
        __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero);
        __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero);
        v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
        v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);

        __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);
        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
      }

      // Update total sum and clear the vectors
      s += mm256_accumulate_epi32(v_acc_sum);
      ss += mm256_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
      __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero);
      __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero);
      v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
      v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);

      __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);
      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm256_accumulate_epi32(v_acc_sum);
    ss += mm256_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process the remaining area using C
  srcp = srcp1;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint16_t val = srcp[m];
      s += val;
      ss += val * val;
    }
    srcp += src_stride;
  }
  return (ss - s * s / (width * height));
}
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
index 22d7739..85b301a 100644
--- a/aom_dsp/x86/sum_squares_sse2.c
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -201,3 +201,166 @@
return aom_sum_squares_i16_c(src, n);
}
}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 2);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ return _mm_extract_epi16(vec_a, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ return _mm_cvtsi128_si32(vec_a);
+}
+
// Computes SS - S^2/(w*h) (i.e. width*height*variance, before the caller's
// division) over an 8-bit 2D block. Columns are consumed 16 at a time with
// SSE2; any leftover right-edge columns (width % 16) go through the scalar
// tail loop at the bottom.
uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width,
                            int height) {
  uint64_t sum = 0, sum_sq = 0;
  const __m128i zero = _mm_setzero_si128();
  __m128i acc_sum = zero;
  __m128i acc_sqs = zero;
  uint8_t *row_ptr;
  int i, j;

  for (i = 0; i < width - 15; i += 16) {
    row_ptr = src + i;
    // Blocks of 8 rows; the accumulators are flushed once per block so the
    // 16-bit partial sums cannot overflow.
    for (j = 0; j < height - 7; j += 8) {
      __m128i rows[8];
      int k;
      for (k = 0; k < 8; k++) {
        rows[k] = _mm_loadu_si128((__m128i *)row_ptr);
        row_ptr += src_stride;
      }
      for (k = 0; k < 8; k++) {
        // Widen bytes to 16-bit lanes, then accumulate sums (epi16) and
        // squared sums (madd -> epi32).
        const __m128i lo = _mm_unpacklo_epi8(rows[k], zero);
        const __m128i hi = _mm_unpackhi_epi8(rows[k], zero);
        acc_sum = _mm_add_epi16(acc_sum, lo);
        acc_sum = _mm_add_epi16(acc_sum, hi);
        acc_sqs = _mm_add_epi32(acc_sqs, _mm_madd_epi16(lo, lo));
        acc_sqs = _mm_add_epi32(acc_sqs, _mm_madd_epi16(hi, hi));
      }

      // Flush the vector accumulators into the 64-bit scalar totals.
      sum += mm_accumulate_epi16(acc_sum);
      sum_sq += mm_accumulate_epi32(acc_sqs);
      acc_sum = zero;
      acc_sqs = zero;
    }

    // Leftover rows (height % 8), one at a time.
    for (; j < height; j++) {
      const __m128i row = _mm_loadu_si128((__m128i *)row_ptr);
      const __m128i lo = _mm_unpacklo_epi8(row, zero);
      const __m128i hi = _mm_unpackhi_epi8(row, zero);
      acc_sum = _mm_add_epi16(acc_sum, lo);
      acc_sum = _mm_add_epi16(acc_sum, hi);
      acc_sqs = _mm_add_epi32(acc_sqs, _mm_madd_epi16(lo, lo));
      acc_sqs = _mm_add_epi32(acc_sqs, _mm_madd_epi16(hi, hi));
      row_ptr += src_stride;
    }

    // Flush once more for the leftover rows.
    sum += mm_accumulate_epi16(acc_sum);
    sum_sq += mm_accumulate_epi32(acc_sqs);
    acc_sum = zero;
    acc_sqs = zero;
  }

  // Scalar pass over the right-edge columns not covered above.
  row_ptr = src;
  for (int r = 0; r < height; r++) {
    for (int c = i; c < width; c++) {
      const uint8_t val = row_ptr[c];
      sum += val;
      sum_sq += val * val;
    }
    row_ptr += src_stride;
  }
  return (sum_sq - sum * sum / (width * height));
}
+
// High-bitdepth counterpart of aom_var_2d_u8_sse2: |src| is a
// CONVERT_TO_BYTEPTR()-style pointer to uint16_t samples. Computes
// SS - S^2/(w*h) (i.e. width*height*variance before the caller's division).
// NOTE(review): the squared terms use _mm_madd_epi16, which treats the 16-bit
// lanes as signed -- correct for samples up to 15 bits (the intended <=12-bit
// high-bitdepth range), not for full-range uint16_t input.
uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width,
                             int height) {
  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
  uint64_t s = 0, ss = 0;
  __m128i vzero = _mm_setzero_si128();
  __m128i v_acc_sum = vzero;
  __m128i v_acc_sqs = vzero;
  int i, j;

  // Process 8 elements in a row. Bound fixed from "width - 8": that
  // off-by-one left the last 8 columns of a multiple-of-8 width (including
  // the whole block when width == 8) to the scalar tail.
  for (i = 0; i < width - 7; i += 8) {
    srcp = srcp1 + i;
    // Process 8 rows at a time. Bound fixed from "height - 8" for the same
    // reason (the u8 variants already use "- 7").
    for (j = 0; j < height - 7; j += 8) {
      __m128i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Widen to 32-bit lanes for the sum; squares come straight from madd.
        __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero);
        __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero);
        v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
        v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);

        __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]);
        v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
      }

      // Update total sum and clear the vectors
      s += mm_accumulate_epi32(v_acc_sum);
      ss += mm_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
      __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero);
      __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero);
      v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
      v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);

      __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc);
      v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm_accumulate_epi32(v_acc_sum);
    ss += mm_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process the remaining area using C
  srcp = srcp1;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint16_t val = srcp[m];
      s += val;
      ss += val * val;
    }
    srcp += src_stride;
  }
  return (ss - s * s / (width * height));
}
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 95f4e16..c674191 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -552,4 +552,275 @@
INSTANTIATE_TEST_CASE_P(AVX2, SSE_Sum_Test,
Combine(ValuesIn(sse_sum_avx2), Range(4, 65, 4)));
#endif // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// 2D Variance test functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*Var2DFunc)(uint8_t *src, int stride, int width, int height);
+typedef libaom_test::FuncParam<Var2DFunc> TestFuncVar2D;
+
+const uint16_t test_block_size[2] = { 128, 256 };
+
+class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
+ public:
+ virtual ~Lowbd2dVarTest() {}
+ virtual void SetUp() {
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, 512 * 512 * sizeof(uint8_t)));
+ ASSERT_TRUE(src_ != NULL);
+ }
+
+ virtual void TearDown() {
+ libaom_test::ClearSystemState();
+ aom_free(src_);
+ }
+ void RunTest(int isRandom);
+ void RunSpeedTest();
+
+ void GenRandomData(int width, int height, int stride) {
+ const int msb = 7; // Up to 8 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {
+ const int msb = 7; // Up to 8 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : 0;
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncVar2D params_;
+ uint8_t *src_;
+ ACMRandom rnd_;
+};
+
+void Lowbd2dVarTest::RunTest(int isRandom) {
+ int failed = 0;
+ for (int k = 0; k < kNumIterations; k++) {
+ const int width = 4 * (rnd_(63) + 1); // Up to 256x256
+ const int height = 4 * (rnd_(63) + 1); // Up to 256x256
+ int stride = 4 << rnd_(8); // Up to 512 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(8);
+ }
+ if (isRandom) {
+ GenRandomData(width, height, stride);
+ } else {
+ GenExtremeData(width, height, stride);
+ }
+
+ const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
+ uint64_t res_tst;
+ ASM_REGISTER_STATE_CHECK(res_tst =
+ params_.tst_func(src_, stride, width, height));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
+ }
+}
+
+void Lowbd2dVarTest::RunSpeedTest() {
+ for (int block = 0; block < 2; block++) {
+ const int width = test_block_size[block];
+ const int height = test_block_size[block];
+ int stride = 4 << rnd_(8); // Up to 512 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(8);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(src_, stride, width, height);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(src_, stride, width, height);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("%3dx%-3d: Scaling = %.2f\n", width, height,
+ (double)elapsed_time / elapsed_time1);
+ }
+}
+
+TEST_P(Lowbd2dVarTest, OperationCheck) {
+ RunTest(1); // GenRandomData
+}
+
+TEST_P(Lowbd2dVarTest, ExtremeValues) {
+ RunTest(0); // GenExtremeData
+}
+
+TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(SSE2, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+ &aom_var_2d_u8_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+ &aom_var_2d_u8_avx2)));
+
+#endif  // HAVE_AVX2
+
+class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
+ public:
+ virtual ~Highbd2dVarTest() {}
+ virtual void SetUp() {
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, 512 * 512 * sizeof(uint16_t)));
+ ASSERT_TRUE(src_ != NULL);
+ }
+
+ virtual void TearDown() {
+ libaom_test::ClearSystemState();
+ aom_free(src_);
+ }
+ void RunTest(int isRandom);
+ void RunSpeedTest();
+
+ void GenRandomData(int width, int height, int stride) {
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : 0;
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncVar2D params_;
+ uint16_t *src_;
+ ACMRandom rnd_;
+};
+
+void Highbd2dVarTest::RunTest(int isRandom) {
+ int failed = 0;
+ for (int k = 0; k < kNumIterations; k++) {
+ const int width = 4 * (rnd_(63) + 1); // Up to 256x256
+ const int height = 4 * (rnd_(63) + 1); // Up to 256x256
+ int stride = 4 << rnd_(8); // Up to 512 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(8);
+ }
+ if (isRandom) {
+ GenRandomData(width, height, stride);
+ } else {
+ GenExtremeData(width, height, stride);
+ }
+
+ const uint64_t res_ref =
+ params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+ uint64_t res_tst;
+ ASM_REGISTER_STATE_CHECK(
+ res_tst =
+ params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
+ }
+}
+
+void Highbd2dVarTest::RunSpeedTest() {
+ for (int block = 0; block < 2; block++) {
+ const int width = test_block_size[block];
+ const int height = test_block_size[block];
+ int stride = 4 << rnd_(8); // Up to 512 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(8);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("%3dx%-3d: Scaling = %.2f\n", width, height,
+ (double)elapsed_time / elapsed_time1);
+ }
+}
+
+TEST_P(Highbd2dVarTest, OperationCheck) {
+ RunTest(1); // GenRandomData
+}
+
+TEST_P(Highbd2dVarTest, ExtremeValues) {
+ RunTest(0); // GenExtremeData
+}
+
+TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(SSE2, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c,
+ &aom_var_2d_u16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c,
+ &aom_var_2d_u16_avx2)));
+
+#endif  // HAVE_AVX2
} // namespace