RTC: Add SSE4_1 optimization for aom_vector_var
This commit also fixes a potential integer overflow issue in the C code
when the function is applied to BLOCK_128X128.
Performance:
Functional Level:
| Width | C Code | SSE4_1 | Gain |
|-------|--------|--------|------|
| 16 | 42430 | 25017 | 1.70 |
| 32 | 69805 | 31255 | 2.23 |
| 64 | 126035 | 43659 | 2.89 |
| 128 | 238695 | 72269 | 3.30 |
Encoder Level:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|----------|----------|----------|---------|-------|
| 7 | rtc | +0.000% | +0.000% | +0.000% | -1.2% |
| 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -1.2% |
|---------|----------|----------|----------|---------|-------|
| 8 | rtc | +0.000% | +0.000% | +0.000% | -1.2% |
| 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.8% |
|---------|----------|----------|----------|---------|-------|
| 9 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
| 10 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
Change-Id: If02b50a037b9f4b87fc57d471a6998130f575d41
diff --git a/test/avg_test.cc b/test/avg_test.cc
index b12d1ef..93f4c34 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -636,7 +636,7 @@
}
TEST_P(VectorVarTest, DISABLED_Speed) {
FillRandom();
- const int numIter = 50000;
+ const int numIter = 5000000;
printf("Width = %d number of iteration is %d \n", width, numIter);
int sum_c_var = 0;
@@ -942,6 +942,16 @@
make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
#endif
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(5, &aom_vector_var_c,
+ &aom_vector_var_sse4_1)));
+#endif // HAVE_SSE4_1
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, SatdTest,