rtc: Add AVX2 variant for functions related to motion search

This CL adds AVX2 variants of the aom_int_pro_row() and
aom_int_pro_col() functions. It also refactors the existing code to
make it AVX2 friendly.

The overall encode time reduction for the RT preset is listed below:
       Encode_time
cpu    Reduction(%)
 7       1.150
 8       1.331

Change-Id: Idebe6dd72933674148bcc41785f9b42d93dc2f11
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 93f4c34..bcbf1b9 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,7 @@
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBlockSize = 128 * 128;
 
   virtual void SetUp() {
     const testing::TestInfo *const test_info =
@@ -343,20 +343,32 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+                              const int ref_stride, const int width,
+                              const int height, int norm_factor);
 
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
 
 class IntProRowTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProRowParam> {
  public:
   IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
         hbuf_c_(nullptr) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
+  }
+
+  void set_norm_factor() {
+    if (height_ == 128)
+      norm_factor_ = 6;
+    else if (height_ == 64)
+      norm_factor_ = 5;
+    else if (height_ == 32)
+      norm_factor_ = 4;
+    else if (height_ == 16)
+      norm_factor_ = 3;
   }
 
  protected:
@@ -366,10 +378,10 @@
     ASSERT_NE(source_data_, nullptr);
 
     hbuf_asm_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
     ASSERT_NE(hbuf_asm_, nullptr);
     hbuf_c_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
     ASSERT_NE(hbuf_c_, nullptr);
   }
 
@@ -383,19 +395,24 @@
   }
 
   void RunComparison() {
-    API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+    set_norm_factor();
+    API_REGISTER_STATE_CHECK(
+        c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
         << "Output mismatch\n";
   }
 
   void RunSpeedTest() {
     const int numIter = 5000000;
-    printf("Height = %d number of iteration is %d \n", height_, numIter);
+    set_norm_factor();
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
     aom_usec_timer c_timer_;
     aom_usec_timer_start(&c_timer_);
     for (int i = 0; i < numIter; i++) {
-      c_func_(hbuf_c_, source_data_, 0, height_);
+      c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&c_timer_);
 
@@ -403,7 +420,7 @@
     aom_usec_timer_start(&asm_timer_);
 
     for (int i = 0; i < numIter; i++) {
-      asm_func_(hbuf_asm_, source_data_, 0, height_);
+      asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&asm_timer_);
 
@@ -415,7 +432,7 @@
            asm_sum_time,
            (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
 
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
         << "Output mismatch\n";
   }
 
@@ -424,35 +441,68 @@
   IntProRowFunc c_func_;
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
+  int norm_factor_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+                              const int ref_stride, const int width,
+                              const int height, int norm_factor);
 
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
 
 class IntProColTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProColParam> {
  public:
-  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
+  IntProColTest()
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+        vbuf_c_(nullptr) {
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
   }
 
  protected:
+  virtual void SetUp() {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+    // The result buffers are compared over height_ entries; size them to match.
+    vbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * height_));
+    ASSERT_NE(vbuf_asm_, nullptr);
+    vbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * height_));
+    ASSERT_NE(vbuf_c_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = nullptr;
+    aom_free(vbuf_c_);
+    vbuf_c_ = nullptr;
+    aom_free(vbuf_asm_);
+    vbuf_asm_ = nullptr;
+  }
+
   void RunComparison() {
-    API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+    int norm_factor_ = 3 + (width_ >> 5);
+    API_REGISTER_STATE_CHECK(
+        c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+    API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor_));
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
   }
   void RunSpeedTest() {
     const int numIter = 5000000;
-    printf("Width = %d number of iteration is %d \n", width_, numIter);
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
+    int norm_factor_ = 3 + (width_ >> 5);
     aom_usec_timer c_timer_;
     aom_usec_timer_start(&c_timer_);
     for (int i = 0; i < numIter; i++) {
-      sum_c_ = c_func_(source_data_, width_);
+      c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&c_timer_);
 
@@ -460,7 +510,7 @@
     aom_usec_timer_start(&asm_timer_);
 
     for (int i = 0; i < numIter; i++) {
-      sum_asm_ = asm_func_(source_data_, width_);
+      asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&asm_timer_);
 
@@ -472,14 +522,15 @@
            asm_sum_time,
            (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
 
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
   }
 
  private:
   IntProColFunc asm_func_;
   IntProColFunc c_func_;
-  int16_t sum_asm_;
-  int16_t sum_c_;
+  int16_t *vbuf_asm_;
+  int16_t *vbuf_c_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
 
@@ -703,19 +754,19 @@
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_sse2,
-                                 &aom_int_pro_row_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_sse2,
-                                 &aom_int_pro_col_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
 #endif
 
 #if HAVE_AVX2
@@ -724,6 +775,22 @@
     ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
                       make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
                       make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IntProRowTest,
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IntProColTest,
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
 #endif
 
 #if HAVE_NEON
@@ -737,19 +804,19 @@
                       make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
 INSTANTIATE_TEST_SUITE_P(
     NEON, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_neon,
-                                 &aom_int_pro_row_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_neon,
-                                 &aom_int_pro_col_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, AvgTest8bpp_avg_8x8_quad,