RTC: Add SSE4_1 optimization for aom_vector_var

This commit also fixes a potential integer overflow issue in the C code
when the function is applied to BLOCK_128X128.

Performance:
Functional Level:
| Width | C Code | SSE4_1 | Gain |
|-------|--------|--------|------|
|   16  |  42430 |  25017 | 1.70 |
|   32  |  69805 |  31255 | 2.23 |
|   64  | 126035 |  43659 | 2.89 |
|  128  | 238695 |  72269 | 3.30 |

Encoder Level:
| SPD_SET | TESTSET  | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|----------|----------|----------|---------|-------|
|    7    |   rtc    | +0.000%  | +0.000%  | +0.000% | -1.2% |
|    7    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -1.2% |
|---------|----------|----------|----------|---------|-------|
|    8    |   rtc    | +0.000%  | +0.000%  | +0.000% | -1.2% |
|    8    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.8% |
|---------|----------|----------|----------|---------|-------|
|    9    |   rtc    | +0.000%  | +0.000%  | +0.000% | -0.1% |
|    9    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
|   10    |   rtc    | +0.000%  | +0.000%  | +0.000% | -0.1% |
|   10    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.1% |

Change-Id: If02b50a037b9f4b87fc57d471a6998130f575d41
diff --git a/test/avg_test.cc b/test/avg_test.cc
index b12d1ef..93f4c34 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -636,7 +636,7 @@
 }
 TEST_P(VectorVarTest, DISABLED_Speed) {
   FillRandom();
-  const int numIter = 50000;
+  const int numIter = 5000000;
   printf("Width = %d number of iteration is %d \n", width, numIter);
 
   int sum_c_var = 0;
@@ -942,6 +942,16 @@
                       make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, VectorVarTest,
+    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(5, &aom_vector_var_c,
+                                 &aom_vector_var_sse4_1)));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, SatdTest,