Add av1_calc_indices_dim2_avx2()

PERF performance: 2.68% for c-code 0.59% for AVX2

Speed Test:
Block Size	Gain for 2 Centroids
8x8		3.03x
8x16		2.99x
8x32		3.03x
16x8		3.01x
16x16		3.0x
16x32		2.84x
32x8		3.04x
32x16		2.92x
32x32		2.97x
32x64		3.04x
64x32		3.07x
64x64		4.59x
16x64		2.95x
64x16		2.99x

Change-Id: I5e7f7e811e15721241b7c34a42d5baaf0b8e7020
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2264b80..db058fd 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -361,8 +361,12 @@
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
+##Krishna SSE2 TODO
 add_proto qw/void av1_calc_indices_dim1/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
-specialize qw/av1_calc_indices_dim1 avx2/;
+  specialize qw/av1_calc_indices_dim1 avx2/;
+
+add_proto qw/void av1_calc_indices_dim2/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+  specialize qw/av1_calc_indices_dim2 avx2/;
 
   # ENCODEMB INVOKE
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 1998a8a..84c52a2 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -98,7 +98,7 @@
 #if AV1_K_MEANS_DIM - 2
   av1_calc_indices_dim1(data, centroids, indices, n, k);
 #else
-  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  av1_calc_indices_dim2(data, centroids, indices, n, k);
 #endif
   int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
@@ -112,7 +112,7 @@
 #if AV1_K_MEANS_DIM - 2
     av1_calc_indices_dim1(data, centroids, indices, n, k);
 #else
-    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+    av1_calc_indices_dim2(data, centroids, indices, n, k);
 #endif
     this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index b1e1b14..85af473 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -28,9 +28,6 @@
 /*!\cond */
 #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
-                                             const int *centroids,
-                                             uint8_t *indices, int n, int k);
 void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
@@ -61,7 +58,7 @@
   if (dim == 1) {
     av1_calc_indices_dim1(data, centroids, indices, n, k);
   } else if (dim == 2) {
-    av1_calc_indices_dim2_c(data, centroids, indices, n, k);
+    av1_calc_indices_dim2(data, centroids, indices, n, k);
   } else {
     assert(0 && "Untemplated k means dimension");
   }
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
index a96ed2e..23a7369 100644
--- a/av1/encoder/x86/av1_k_means_avx2.c
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -16,7 +16,7 @@
 void av1_calc_indices_dim1_avx2(const int *data, const int *centroids,
                                 uint8_t *indices, int n, int k) {
   __m256i dist[PALETTE_MAX_SIZE];
-  __m256i v_zero = _mm256_setzero_si256();
+  const __m256i v_zero = _mm256_setzero_si256();
 
   for (int i = 0; i < n; i += 8) {
     __m256i ind = _mm256_loadu_si256((__m256i *)data);
@@ -48,3 +48,48 @@
     data += 8;
   }
 }
+
+void av1_calc_indices_dim2_avx2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  __m256i dist[PALETTE_MAX_SIZE];
+  const __m256i v_zero = _mm256_setzero_si256();
+  const __m256i v_permute = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
+
+  for (int i = 0; i < n; i += 8) {
+    __m256i ind1 = _mm256_loadu_si256((__m256i *)data);
+    __m256i ind2 = _mm256_loadu_si256((__m256i *)(data + 8));
+    for (int j = 0; j < k; j++) {
+      __m128i cent0 = _mm_loadl_epi64((__m128i const *)&centroids[2 * j]);
+      __m256i cent1 = _mm256_inserti128_si256(v_zero, cent0, 0);
+      cent1 = _mm256_inserti128_si256(cent1, cent0, 1);
+      __m256i cent = _mm256_unpacklo_epi64(cent1, cent1);
+      __m256i d1 = _mm256_sub_epi32(ind1, cent);
+      __m256i d2 = _mm256_sub_epi32(ind2, cent);
+      __m256i d3 = _mm256_mullo_epi32(d1, d1);
+      __m256i d4 = _mm256_mullo_epi32(d2, d2);
+      __m256i d5 = _mm256_hadd_epi32(d3, d4);
+      dist[j] = _mm256_permutevar8x32_epi32(d5, v_permute);
+    }
+
+    __m256i ind = _mm256_setzero_si256();
+    for (int j = 1; j < k; j++) {
+      __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
+      __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
+      __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
+      dist[0] = _mm256_or_si256(dist1, dist2);
+      ind1 = _mm256_set1_epi32(j);
+      ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+                            _mm256_and_si256(cmp, ind1));
+    }
+
+    __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+    __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+    __m256i p2 = _mm256_packus_epi16(px, v_zero);
+    __m128i d1 = _mm256_extracti128_si256(p2, 0);
+
+    _mm_storel_epi64((__m128i *)indices, d1);
+
+    indices += 8;
+    data += 16;
+  }
+}
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index cda0c79..754a2da 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -32,14 +32,20 @@
 typedef void (*av1_calc_indices_dim1_func)(const int *data,
                                            const int *centroids,
                                            uint8_t *indices, int n, int k);
+typedef void (*av1_calc_indices_dim2_func)(const int *data,
+                                           const int *centroids,
+                                           uint8_t *indices, int n, int k);
 
 typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
     av1_calc_indices_dim1Param;
 
-class AV1KmeansTest
+typedef std::tuple<av1_calc_indices_dim2_func, BLOCK_SIZE>
+    av1_calc_indices_dim2Param;
+
+class AV1KmeansTest1
     : public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
  public:
-  ~AV1KmeansTest();
+  ~AV1KmeansTest1();
   void SetUp();
 
   void TearDown();
@@ -61,21 +67,18 @@
   }
 
   libaom_test::ACMRandom rnd_;
-  int data_[5096];
+  int data_[4096];
   int centroids_[8];
-  uint8_t indices1_[5096];
-  uint8_t indices2_[5096];
+  uint8_t indices1_[4096];
+  uint8_t indices2_[4096];
 };
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
 
-AV1KmeansTest::~AV1KmeansTest() { ; }
+AV1KmeansTest1::~AV1KmeansTest1() { ; }
 
-void AV1KmeansTest::SetUp() {
+void AV1KmeansTest1::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  /*uint8_t indices1_[5096];
-  uint8_t indices2_[5096];
-  int data_[5096];*/
-  for (int i = 0; i < 5096; ++i) {
+  for (int i = 0; i < 4096; ++i) {
     data_[i] = (int)rnd_.Rand8() << 4;
   }
   for (int i = 0; i < 8; i++) {
@@ -83,21 +86,22 @@
   }
 }
 
-void AV1KmeansTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1KmeansTest1::TearDown() { libaom_test::ClearSystemState(); }
 
-void AV1KmeansTest::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
-                                   BLOCK_SIZE bsize, int k) {
+void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
   const int n = w * h;
   av1_calc_indices_dim1_c(data_, centroids_, indices1_, n, k);
   test_impl(data_, centroids_, indices2_, n, k);
 
-  ASSERT_EQ(CheckResult(n), true) << " block " << bsize << " Centroids " << n;
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
 }
 
-void AV1KmeansTest::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
-                                 BLOCK_SIZE bsize, int k) {
+void AV1KmeansTest1::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
+                                  BLOCK_SIZE bsize, int k) {
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
   const int n = w * h;
@@ -121,7 +125,7 @@
   printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
 }
 
-TEST_P(AV1KmeansTest, CheckOutput) {
+TEST_P(AV1KmeansTest1, CheckOutput) {
   // centroids = 2..8
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
@@ -132,7 +136,115 @@
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
 }
 
-TEST_P(AV1KmeansTest, DISABLED_Speed) {
+TEST_P(AV1KmeansTest1, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 5);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 6);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 7);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+class AV1KmeansTest2
+    : public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
+ public:
+  ~AV1KmeansTest2();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                      int centroids);
+  void RunSpeedTest(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                    int centroids);
+  bool CheckResult(int n) {
+    bool flag = true;
+    for (int idx = 0; idx < n; ++idx) {
+      if (indices1_[idx] != indices2_[idx]) {
+        printf("%d ", idx);
+        printf("%d != %d ", indices1_[idx], indices2_[idx]);
+        flag = false;
+      }
+    }
+    if (flag == false) {
+      return false;
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  int data_[4096 * 2];
+  int centroids_[8 * 2];
+  uint8_t indices1_[4096];
+  uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
+
+AV1KmeansTest2::~AV1KmeansTest2() { ; }
+
+void AV1KmeansTest2::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  for (int i = 0; i < 4096 * 2; ++i) {
+    data_[i] = (int)rnd_.Rand8();
+  }
+  for (int i = 0; i < 8 * 2; i++) {
+    centroids_[i] = (int)rnd_.Rand8();
+  }
+}
+
+void AV1KmeansTest2::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  av1_calc_indices_dim2_c(data_, centroids_, indices1_, n, k);
+  test_impl(data_, centroids_, indices2_, n, k);
+
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+void AV1KmeansTest2::RunSpeedTest(av1_calc_indices_dim2_func test_impl,
+                                  BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  const int num_loops = 1000000000 / n;
+
+  av1_calc_indices_dim2_func funcs[2] = { av1_calc_indices_dim2_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    av1_calc_indices_dim2_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(data_, centroids_, indices1_, n, k);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_calc_indices_dim2 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1KmeansTest2, CheckOutput) {
+  // centroids = 2..8
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 4);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 5);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 6);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 7);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+TEST_P(AV1KmeansTest2, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
@@ -150,9 +262,13 @@
                                        BLOCK_16X64, BLOCK_64X16 };
 
 INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1KmeansTest,
+    AVX2, AV1KmeansTest1,
     ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
                        ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1KmeansTest2,
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
 #endif
 
 }  // namespace AV1Kmeans