Implement aom_variance{128,64,32}_sse2/avx2

Add SSE2 and AVX2 versions of aom_variance for the block sizes with a
128 dimension: 128x128, 128x64, 64x128, 128x32, and 32x128.

The speed test in the unit tests shows the SSE2 versions are 5x~6x
faster than the C versions, and the AVX2 versions are 9x~11x faster.
For encoder run time, encoding 20 frames of foreman_cif shows a 2%
speedup.
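
Each kernel computes the usual identity
variance = sse - sum^2 / (bw * bh), where the division reduces to a
right shift because bw * bh is a power of two. A minimal scalar
sketch of that step (illustrative only; block_variance is not part
of this change):

  #include <stdint.h>

  /* sse: sum of squared differences; sum: signed sum of differences.
   * shift = log2(bw * bh), e.g. 14 for 128x128, 13 for 128x64. */
  static uint32_t block_variance(uint32_t sse, int sum, int shift) {
    return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
  }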

Change-Id: Ibf91514f6c876d47d56081fab54614f7c3609666
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e821aba..f9ab56b 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -981,7 +981,11 @@
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
   }
-
+  specialize qw/aom_variance128x128   sse2 avx2         /;
+  specialize qw/aom_variance128x64    sse2 avx2         /;
+  specialize qw/aom_variance64x128    sse2 avx2         /;
+  specialize qw/aom_variance128x32    sse2 avx2         /;
+  specialize qw/aom_variance32x128    sse2 avx2         /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x64     sse2      neon msa/;
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index a041bba..0d94e6c 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -47,9 +47,7 @@
                                     unsigned int *sse) {
   int sum;
   unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                aom_get16x16var_avx2, 16);
-
+  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
   variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
   _mm256_zeroupper();
   return variance;
@@ -64,57 +62,28 @@
   return *sse;
 }
 
-unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get32x32var_avx2, 32);
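+// Expands to aom_variance<bw>x<bh>_avx2(): the bw x bh block is covered
+// with the <w>x<w> AVX2 get_var kernel, and bits = log2(bw * bh) is the
+// shift that replaces division by the pixel count.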
+#define AOM_VAR_AVX2(bw, bh, w, bits)                                         \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    unsigned int variance;                                                    \
+    variance_avx2(src, src_stride, ref, ref_stride, bw, bh, sse, &sum,        \
+                  aom_get##w##x##w##var_avx2, w);                             \
+    variance = *sse - (uint32_t)(((int64_t)sum * sum) >> bits);               \
+    _mm256_zeroupper();                                                       \
+    return variance;                                                          \
+  }
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
-
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get32x32var_avx2, 32);
-
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
-
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
-  _mm256_zeroupper();
-  return variance;
-}
+AOM_VAR_AVX2(32, 16, 16, 9);
+AOM_VAR_AVX2(32, 32, 32, 10);
+AOM_VAR_AVX2(64, 64, 32, 12);
+AOM_VAR_AVX2(64, 32, 32, 11);
+AOM_VAR_AVX2(128, 128, 32, 14);
+AOM_VAR_AVX2(128, 64, 32, 13);
+AOM_VAR_AVX2(64, 128, 32, 13);
+AOM_VAR_AVX2(128, 32, 32, 12);
+AOM_VAR_AVX2(32, 128, 32, 12);
 
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index af4c8ea..cd0605a 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -249,71 +249,29 @@
   return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
 }
 
-unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 32);
-  assert(sum >= -255 * 32 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
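+// Expands to aom_variance<bw>x<bh>_sse2() on top of the 16x16 SSE2
+// get_var kernel; bits = log2(bw * bh).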
+#define AOM_VAR_16_SSE2(bw, bh, bits)                                         \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_sse2(src, src_stride, ref, ref_stride, bw, bh, sse, &sum,        \
+                  aom_get16x16var_sse2, 16);                                  \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
-
-unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
-
-unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 64);
-  assert(sum >= -255 * 64 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
-}
-
-unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
-
-unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_16_SSE2(32, 32, 10);
+AOM_VAR_16_SSE2(32, 16, 9);
+AOM_VAR_16_SSE2(16, 32, 9);
+AOM_VAR_16_SSE2(64, 64, 12);
+AOM_VAR_16_SSE2(64, 32, 11);
+AOM_VAR_16_SSE2(32, 64, 11);
+AOM_VAR_16_SSE2(128, 128, 14);
+AOM_VAR_16_SSE2(128, 64, 13);
+AOM_VAR_16_SSE2(64, 128, 13);
+AOM_VAR_16_SSE2(128, 32, 12);
+AOM_VAR_16_SSE2(32, 128, 12);
 
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 391031c..856817e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -395,6 +395,7 @@
   void RefTest();
   void RefStrideTest();
   void OneQuarterTest();
+  void SpeedTest();
 
   // MSE/SSE tests
   void RefTestMse();
@@ -515,6 +516,31 @@
   EXPECT_EQ(expected, var);
 }
 
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  for (int j = 0; j < block_size(); j++) {
+    if (!use_high_bit_depth()) {
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
+    } else {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+  unsigned int sse1, sse2, var1, var2;
+  const int stride = width();
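+  // No timer here: the iteration count is scaled so each block size
+  // processes roughly 10^9 pixels; the (disabled) test is timed
+  // externally, e.g. with --gtest_also_run_disabled_tests.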
+  int run_time = 1000000000 / block_size();
+
+  ASM_REGISTER_STATE_CHECK(var1 =
+                               params_.func(src_, stride, ref_, stride, &sse1));
+  for (int i = 0; i < run_time; ++i) {
+    ASM_REGISTER_STATE_CHECK(
+        var2 = params_.func(src_, stride, ref_, stride, &sse2));
+  }
+  EXPECT_EQ(var1, var2);
+  EXPECT_EQ(sse1, sse2);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
@@ -586,17 +612,17 @@
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     if (!use_high_bit_depth()) {
-      src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
-      sec_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
+      src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+      sec_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
       ref_ = reinterpret_cast<uint8_t *>(
-          aom_memalign(16, block_size() + width() + height() + 1));
+          aom_memalign(32, block_size() + width() + height() + 1));
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          aom_memalign(16, block_size() * sizeof(uint16_t))));
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
       sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          aom_memalign(16, block_size() * sizeof(uint16_t))));
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
       ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
-          16, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
     }
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
@@ -791,6 +817,7 @@
 TEST_P(AvxVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
@@ -816,7 +843,12 @@
 typedef TestParams<VarianceMxNFunc> VarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_c),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+                      VarianceParams(7, 6, &aom_variance128x64_c),
+                      VarianceParams(6, 7, &aom_variance64x128_c),
+                      VarianceParams(7, 5, &aom_variance128x32_c),
+                      VarianceParams(5, 7, &aom_variance32x128_c),
+                      VarianceParams(6, 6, &aom_variance64x64_c),
                       VarianceParams(6, 5, &aom_variance64x32_c),
                       VarianceParams(5, 6, &aom_variance32x64_c),
                       VarianceParams(5, 5, &aom_variance32x32_c),
@@ -1154,7 +1186,12 @@
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_sse2),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
+                      VarianceParams(7, 6, &aom_variance128x64_sse2),
+                      VarianceParams(6, 7, &aom_variance64x128_sse2),
+                      VarianceParams(7, 5, &aom_variance128x32_sse2),
+                      VarianceParams(5, 7, &aom_variance32x128_sse2),
+                      VarianceParams(6, 6, &aom_variance64x64_sse2),
                       VarianceParams(6, 5, &aom_variance64x32_sse2),
                       VarianceParams(5, 6, &aom_variance32x64_sse2),
                       VarianceParams(5, 5, &aom_variance32x32_sse2),
@@ -1479,7 +1516,12 @@
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_avx2),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
+                      VarianceParams(7, 6, &aom_variance128x64_avx2),
+                      VarianceParams(6, 7, &aom_variance64x128_avx2),
+                      VarianceParams(7, 5, &aom_variance128x32_avx2),
+                      VarianceParams(5, 7, &aom_variance32x128_avx2),
+                      VarianceParams(6, 6, &aom_variance64x64_avx2),
                       VarianceParams(6, 5, &aom_variance64x32_avx2),
                       VarianceParams(5, 5, &aom_variance32x32_avx2),
                       VarianceParams(5, 4, &aom_variance32x16_avx2),