RTC: Add SSE4_1 optimization for aom_vector_var

This commit also fixes a potential integer overflow issue in the C code
when the function is applied to BLOCK_128X128.

Performance:
Functional Level:
| Width | C Code | SSE4_1 | Gain |
|-------|--------|--------|------|
|   16  |  42430 |  25017 | 1.70 |
|   32  |  69805 |  31255 | 2.28 |
|   64  | 126035 |  43659 | 2.80 |
|  128  | 238695 |  72269 | 3.30 |

Encoder Level:
| SPD_SET | TESTSET  | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|----------|----------|----------|---------|-------|
|    7    |   rtc    | +0.000%  | +0.000%  | +0.000% | -1.2% |
|    7    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -1.2% |
|---------|----------|----------|----------|---------|-------|
|    8    |   rtc    | +0.000%  | +0.000%  | +0.000% | -1.2% |
|    8    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.8% |
|---------|----------|----------|----------|---------|-------|
|    9    |   rtc    | +0.000%  | +0.000%  | +0.000% | -0.1% |
|    9    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
|   10    |   rtc    | +0.000%  | +0.000%  | +0.000% | -0.1% |
|   10    | rtc_derf | +0.000%  | +0.000%  | +0.000% | -0.1% |

Change-Id: If02b50a037b9f4b87fc57d471a6998130f575d41
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 7e2f570..29a9d3f 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -250,7 +250,9 @@
               "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
 
-  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+              "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c"
+              "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
 
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 4503c8e..1928de0 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1201,7 +1201,7 @@
   specialize qw/aom_int_pro_col sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
-  specialize qw/aom_vector_var neon/;
+  specialize qw/aom_vector_var sse4_1 neon/;
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
   #specialize qw/aom_vector_var neon sse2/;
 
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 9f5b545..593807b 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -187,10 +187,11 @@
     v_sse = vmlal_s16(v_sse, v_high, v_high);
 #endif
   }
-  int mean = horizontal_add_s32x4(v_mean);
-  int sse = horizontal_add_s32x4(v_sse);
+  const int mean = horizontal_add_s32x4(v_mean);
+  const int sse = horizontal_add_s32x4(v_sse);
+  const unsigned int mean_abs = mean >= 0 ? mean : -mean;
   // (mean * mean): dynamic range 31 bits.
-  int var = sse - ((mean * mean) >> (bwl + 2));
+  const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
   return var;
 }
 
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 1e48bc1..a3821e6 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -547,6 +547,9 @@
   }
 
   // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
+  // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
+  // 31.99, so it needs to be casted to unsigned int to compute its square.
+  const unsigned int mean_abs = mean >= 0 ? mean : -mean;
+  var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
   return var;
 }
diff --git a/aom_dsp/x86/avg_intrin_sse4.c b/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 0000000..55a483e
--- /dev/null
+++ b/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src,
+                          const int log_bw) {
+  const int width = 4 << log_bw;
+  assert(width % 16 == 0);
+
+  const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+  __m128i mean = _mm_setzero_si128();
+  __m128i sse = _mm_setzero_si128();
+
+  for (int i = 0; i < width; i += 16) {
+    const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+    const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+    const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+    __m128i diff = _mm_sub_epi16(ref_line, src_line);
+    const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+    __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+    const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+    diff = _mm_add_epi16(diff, diff2);
+    diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+    diff = _mm_madd_epi16(diff, k_one_epi16);
+    sse = _mm_add_epi32(sse, diff_sqr);
+
+    mean = _mm_add_epi32(mean, diff);
+
+    src += 16;
+    ref += 16;
+  }
+
+  mean = _mm_hadd_epi32(mean, mean);
+  sse = _mm_hadd_epi32(sse, sse);
+  mean = _mm_hadd_epi32(mean, mean);
+  sse = _mm_hadd_epi32(sse, sse);
+
+  // (mean * mean): dynamic range 31 bits.
+  const int mean_int = _mm_extract_epi32(mean, 0);
+  const int sse_int = _mm_extract_epi32(sse, 0);
+  const unsigned int mean_abs = mean_int >= 0 ? mean_int : -mean_int;
+  const int var = sse_int - ((mean_abs * mean_abs) >> (log_bw + 2));
+  return var;
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index b12d1ef..93f4c34 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -636,7 +636,7 @@
 }
 TEST_P(VectorVarTest, DISABLED_Speed) {
   FillRandom();
-  const int numIter = 50000;
+  const int numIter = 5000000;
   printf("Width = %d number of iteration is %d \n", width, numIter);
 
   int sum_c_var = 0;
@@ -942,6 +942,16 @@
                       make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, VectorVarTest,
+    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+                      make_tuple(5, &aom_vector_var_c,
+                                 &aom_vector_var_sse4_1)));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, SatdTest,