rtc: Add intrinsics for aom_get_var_sse_sum_16x16_dual()

Introduced SSE2 and NEON intrinsics to compute variance, sse and
sum of two 16x16 blocks and added relevant unit test changes.

Change-Id: I7a633440ddceeab48e0dd4c3b4315a9fcf726468
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e2c9764..12d48be 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1311,7 +1311,7 @@
   specialize qw/aom_get_var_sse_sum_8x8_quad        avx2 sse2 neon/;
 
   add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
-  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2/;
+  specialize qw/aom_get_var_sse_sum_16x16_dual        avx2 sse2 neon/;
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 3cc66a0..7021eca 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -405,6 +405,25 @@
     var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
 }
 
+void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         uint32_t *sse16x16,
+                                         unsigned int *tot_sse, int *tot_sum,
+                                         uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+  // Loop over 2 16x16 blocks. Process one 16x32 block.
+  for (int k = 0; k < 2; k++) {
+    variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride,
+                       16, &sse16x16[k], &sum16x16[k]);
+  }
+
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+  for (int i = 0; i < 2; i++)
+    var16x16[i] =
+        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+}
+
 #if defined(__ARM_FEATURE_DOTPROD)
 
 static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 7d4ff4f..83dd6e8 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -46,6 +46,12 @@
   return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
 }
 
+static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) {
+  const __m128i p0 = _mm_loadu_si128((const __m128i *)p);
+  out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128());  // lower 8 values
+  out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128());  // upper 8 values
+}
+
 // Accumulate 4 32bit numbers in val to 1 32bit number
 static INLINE unsigned int add32x4_sse2(__m128i val) {
   val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
@@ -271,6 +277,42 @@
     var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
 }
 
+void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         uint32_t *sse16x16,
+                                         unsigned int *tot_sse, int *tot_sum,
+                                         uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+  // Loop over 2 16x16 blocks. Process one 16x32 block.
+  for (int k = 0; k < 2; k++) {
+    const uint8_t *src = src_ptr;
+    const uint8_t *ref = ref_ptr;
+    __m128i vsum = _mm_setzero_si128();
+    __m128i vsse = _mm_setzero_si128();
+    for (int i = 0; i < 16; i++) {
+      __m128i s[2];
+      __m128i r[2];
+      load16_8to16_sse2(src + (k * 16), s);
+      load16_8to16_sse2(ref + (k * 16), r);
+      const __m128i diff0 = _mm_sub_epi16(s[0], r[0]);
+      const __m128i diff1 = _mm_sub_epi16(s[1], r[1]);
+      vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+      vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+      vsum = _mm_add_epi16(vsum, _mm_add_epi16(diff0, diff1));
+      src += src_stride;
+      ref += ref_stride;
+    }
+    variance_final_256_pel_sse2(vsse, vsum, &sse16x16[k], &sum16x16[k]);
+  }
+
+  // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+  for (int i = 0; i < 2; i++)
+    var16x16[i] =
+        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+}
+
 #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
   unsigned int aom_variance##bw##x##bh##_sse2(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6b7fd13..93c2e2f 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -42,6 +42,11 @@
                                      uint32_t *sse8x8, int *sum8x8,
                                      unsigned int *tot_sse, int *tot_sum,
                                      uint32_t *var8x8);
+typedef void (*GetSseSum16x16DualFunc)(const uint8_t *a, int a_stride,
+                                       const uint8_t *b, int b_stride,
+                                       uint32_t *sse16x16,
+                                       unsigned int *tot_sse, int *tot_sum,
+                                       uint32_t *var16x16);
 typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *b, int b_stride,
@@ -707,6 +712,12 @@
   void MaxTestSseSum();
   void SseSum_SpeedTest();
 
+  // SSE&SUM dual tests
+  void RefTestSseSumDual();
+  void MinTestSseSumDual();
+  void MaxTestSseSumDual();
+  void SseSum_SpeedTestDual();
+
   // MSE/SSE tests
   void RefTestMse();
   void RefTestSse();
@@ -1022,6 +1033,171 @@
       width(), height(), elapsed_time_ref, elapsed_time_simd,
       elapsed_time_ref / elapsed_time_simd);
 }
+
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::RefTestSseSumDual() {
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      src_[idx] = rnd_.Rand8();
+      ref_[idx] = rnd_.Rand8();
+    }
+    unsigned int sse1[64] = { 0 };
+    unsigned int sse2[64] = { 0 };
+    unsigned int var1[64] = { 0 };
+    unsigned int var2[64] = { 0 };
+    unsigned int sse_tot_c = 0;
+    unsigned int sse_tot_simd = 0;
+    int sum_tot_c = 0;
+    int sum_tot_simd = 0;
+    const int stride = width();
+    int k = 0;
+
+    for (int row = 0; row < height(); row += 16) {
+      for (int col = 0; col < width(); col += 32) {
+        API_REGISTER_STATE_CHECK(params_.func(
+            src_ + stride * row + col, stride, ref_ + stride * row + col,
+            stride, &sse1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+        aom_get_var_sse_sum_16x16_dual_c(
+            src_ + stride * row + col, stride, ref_ + stride * row + col,
+            stride, &sse2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
+        k += 2;
+      }
+    }
+    EXPECT_EQ(sse_tot_c, sse_tot_simd);
+    EXPECT_EQ(sum_tot_c, sum_tot_simd);
+    for (int p = 0; p < 64; p++) {
+      EXPECT_EQ(sse1[p], sse2[p]);
+      EXPECT_EQ(sse_tot_simd, sse_tot_c);
+      EXPECT_EQ(sum_tot_simd, sum_tot_c);
+      EXPECT_EQ(var1[p], var2[p]);
+    }
+  }
+}
+
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::MinTestSseSumDual() {
+  memset(src_, 0, block_size());
+  memset(ref_, 255, block_size());
+  unsigned int sse1[64] = { 0 };
+  unsigned int sse2[64] = { 0 };
+  unsigned int var1[64] = { 0 };
+  unsigned int var2[64] = { 0 };
+  unsigned int sse_tot_c = 0;
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_c = 0;
+  int sum_tot_simd = 0;
+  const int stride = width();
+  int k = 0;
+
+  for (int row = 0; row < height(); row += 16) {
+    for (int col = 0; col < width(); col += 32) {
+      API_REGISTER_STATE_CHECK(params_.func(
+          src_ + stride * row + col, stride, ref_ + stride * row + col, stride,
+          &sse1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+      aom_get_var_sse_sum_16x16_dual_c(
+          src_ + stride * row + col, stride, ref_ + stride * row + col, stride,
+          &sse2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
+      k += 2;
+    }
+  }
+  EXPECT_EQ(sse_tot_simd, sse_tot_c);
+  EXPECT_EQ(sum_tot_simd, sum_tot_c);
+  for (int p = 0; p < 64; p++) {
+    EXPECT_EQ(sse1[p], sse2[p]);
+    EXPECT_EQ(var1[p], var2[p]);
+  }
+}
+
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::MaxTestSseSumDual() {
+  memset(src_, 255, block_size());
+  memset(ref_, 0, block_size());
+  unsigned int sse1[64] = { 0 };
+  unsigned int sse2[64] = { 0 };
+  unsigned int var1[64] = { 0 };
+  unsigned int var2[64] = { 0 };
+  unsigned int sse_tot_c = 0;
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_c = 0;
+  int sum_tot_simd = 0;
+  const int stride = width();
+  int k = 0;
+
+  for (int row = 0; row < height(); row += 16) {
+    for (int col = 0; col < width(); col += 32) {
+      API_REGISTER_STATE_CHECK(params_.func(
+          src_ + stride * row + col, stride, ref_ + stride * row + col, stride,
+          &sse1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+      aom_get_var_sse_sum_16x16_dual_c(
+          src_ + stride * row + col, stride, ref_ + stride * row + col, stride,
+          &sse2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
+      k += 2;
+    }
+  }
+  EXPECT_EQ(sse_tot_c, sse_tot_simd);
+  EXPECT_EQ(sum_tot_c, sum_tot_simd);
+
+  for (int p = 0; p < 64; p++) {
+    EXPECT_EQ(sse1[p], sse2[p]);
+    EXPECT_EQ(var1[p], var2[p]);
+  }
+}
+
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::SseSum_SpeedTestDual() {
+  const int loop_count = 1000000000 / block_size();
+  for (int idx = 0; idx < block_size(); ++idx) {
+    src_[idx] = rnd_.Rand8();
+    ref_[idx] = rnd_.Rand8();
+  }
+
+  unsigned int sse1[2] = { 0 };
+  unsigned int sse2[2] = { 0 };
+  unsigned int var1[2] = { 0 };
+  unsigned int var2[2] = { 0 };
+  unsigned int sse_tot_c = 0;
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_c = 0;
+  int sum_tot_simd = 0;
+  const int stride = width();
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int r = 0; r < loop_count; ++r) {
+    for (int row = 0; row < height(); row += 16) {
+      for (int col = 0; col < width(); col += 32) {
+        aom_get_var_sse_sum_16x16_dual_c(src_ + stride * row + col, stride,
+                                         ref_ + stride * row + col, stride,
+                                         sse2, &sse_tot_c, &sum_tot_c, var2);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_ref =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  aom_usec_timer_start(&timer);
+  for (int r = 0; r < loop_count; ++r) {
+    for (int row = 0; row < height(); row += 16) {
+      for (int col = 0; col < width(); col += 32) {
+        params_.func(src_ + stride * row + col, stride,
+                     ref_ + stride * row + col, stride, sse1, &sse_tot_simd,
+                     &sum_tot_simd, var1);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_simd =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf(
+      "aom_getvar_16x16_dual for block=%dx%d : ref_time=%lf \t simd_time=%lf "
+      "\t "
+      "gain=%lf \n",
+      width(), height(), elapsed_time_ref, elapsed_time_simd,
+      elapsed_time_ref / elapsed_time_simd);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
@@ -1500,6 +1676,7 @@
 typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
 typedef MainTestClass<GetSseSum8x8QuadFunc> GetSseSum8x8QuadTest;
+typedef MainTestClass<GetSseSum16x16DualFunc> GetSseSum16x16DualTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
 typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
@@ -1528,6 +1705,10 @@
 TEST_P(GetSseSum8x8QuadTest, MinSseSum) { MinTestSseSum(); }
 TEST_P(GetSseSum8x8QuadTest, MaxMseSum) { MaxTestSseSum(); }
 TEST_P(GetSseSum8x8QuadTest, DISABLED_Speed) { SseSum_SpeedTest(); }
+TEST_P(GetSseSum16x16DualTest, RefMseSum) { RefTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, MinSseSum) { MinTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, MaxMseSum) { MaxTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, DISABLED_Speed) { SseSum_SpeedTestDual(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
@@ -1610,6 +1791,17 @@
 INSTANTIATE_TEST_SUITE_P(C, GetSseSum8x8QuadTest,
                          ::testing::ValuesIn(kArrayGetSseSum8x8Quad_c));
 
+typedef TestParams<GetSseSum16x16DualFunc> GetSseSumParamsDual;
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_c[] = {
+  GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_c, 0),
+  GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_c, 0),
+  GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_c, 0),
+  GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_c, 0)
+};
+
+INSTANTIATE_TEST_SUITE_P(C, GetSseSum16x16DualTest,
+                         ::testing::ValuesIn(kArrayGetSseSum16x16Dual_c));
+
 typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
 const SubpelVarianceParams kArraySubpelVariance_c[] = {
   SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
@@ -2351,6 +2543,15 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum8x8QuadTest,
                          ::testing::ValuesIn(kArrayGetSseSum8x8Quad_sse2));
 
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_sse2[] = {
+  GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_sse2, 0),
+  GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_sse2, 0),
+  GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_sse2, 0),
+  GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_sse2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum16x16DualTest,
+                         ::testing::ValuesIn(kArrayGetSseSum16x16Dual_sse2));
+
 const SubpelVarianceParams kArraySubpelVariance_sse2[] = {
   SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
   SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
@@ -2969,6 +3170,15 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum8x8QuadTest,
                          ::testing::ValuesIn(kArrayGetSseSum8x8Quad_avx2));
 
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_avx2[] = {
+  GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+  GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+  GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+  GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_avx2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum16x16DualTest,
+                         ::testing::ValuesIn(kArrayGetSseSum16x16Dual_avx2));
+
 const SubpelVarianceParams kArraySubpelVariance_avx2[] = {
   SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
   SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
@@ -3123,6 +3333,15 @@
 INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum8x8QuadTest,
                          ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon));
 
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_neon[] = {
+  GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+  GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+  GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+  GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_neon, 0)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum16x16DualTest,
+                         ::testing::ValuesIn(kArrayGetSseSum16x16Dual_neon));
+
 #if CONFIG_AV1_HIGHBITDEPTH
 const VarianceParams kArrayHBDVariance_neon[] = {
   VarianceParams(7, 7, &aom_highbd_10_variance128x128_neon, 10),