Add av1_calc_indices_dim2_avx2()
PERF performance: 2.68% for c-code 0.59% for AVX2
Speed Test:
Block Size Gain for 2 Centroids
8x8 3.03x
8x16 2.99x
8x32 3.03x
16x8 3.01x
16x16 3.0x
16x32 2.84x
32x8 3.04x
32x16 2.92x
32x32 2.97x
32x64 3.04x
64x32 3.07x
64x64 4.59x
16x64 2.95x
64x16 2.99x
Change-Id: I5e7f7e811e15721241b7c34a42d5baaf0b8e7020
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2264b80..db058fd 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -361,8 +361,12 @@
}
add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+##Krishna SSE2 TODO
add_proto qw/void av1_calc_indices_dim1/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
-specialize qw/av1_calc_indices_dim1 avx2/;
+ specialize qw/av1_calc_indices_dim1 avx2/;
+
+add_proto qw/void av1_calc_indices_dim2/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+ specialize qw/av1_calc_indices_dim2 avx2/;
# ENCODEMB INVOKE
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 1998a8a..84c52a2 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -98,7 +98,7 @@
#if AV1_K_MEANS_DIM - 2
av1_calc_indices_dim1(data, centroids, indices, n, k);
#else
- RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, centroids, indices, n, k);
#endif
int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
@@ -112,7 +112,7 @@
#if AV1_K_MEANS_DIM - 2
av1_calc_indices_dim1(data, centroids, indices, n, k);
#else
- RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, centroids, indices, n, k);
#endif
this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index b1e1b14..85af473 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -28,9 +28,6 @@
/*!\cond */
#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
- const int *centroids,
- uint8_t *indices, int n, int k);
void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
uint8_t *indices, int n, int k,
int max_itr);
@@ -61,7 +58,7 @@
if (dim == 1) {
av1_calc_indices_dim1(data, centroids, indices, n, k);
} else if (dim == 2) {
- av1_calc_indices_dim2_c(data, centroids, indices, n, k);
+ av1_calc_indices_dim2(data, centroids, indices, n, k);
} else {
assert(0 && "Untemplated k means dimension");
}
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
index a96ed2e..23a7369 100644
--- a/av1/encoder/x86/av1_k_means_avx2.c
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -16,7 +16,7 @@
void av1_calc_indices_dim1_avx2(const int *data, const int *centroids,
uint8_t *indices, int n, int k) {
__m256i dist[PALETTE_MAX_SIZE];
- __m256i v_zero = _mm256_setzero_si256();
+ const __m256i v_zero = _mm256_setzero_si256();
for (int i = 0; i < n; i += 8) {
__m256i ind = _mm256_loadu_si256((__m256i *)data);
@@ -48,3 +48,48 @@
data += 8;
}
}
+
+void av1_calc_indices_dim2_avx2(const int *data, const int *centroids,
+ uint8_t *indices, int n, int k) {
+ __m256i dist[PALETTE_MAX_SIZE];
+ const __m256i v_zero = _mm256_setzero_si256();
+ const __m256i v_permute = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
+
+ for (int i = 0; i < n; i += 8) {
+ __m256i ind1 = _mm256_loadu_si256((__m256i *)data);
+ __m256i ind2 = _mm256_loadu_si256((__m256i *)(data + 8));
+ for (int j = 0; j < k; j++) {
+ __m128i cent0 = _mm_loadl_epi64((__m128i const *)¢roids[2 * j]);
+ __m256i cent1 = _mm256_inserti128_si256(v_zero, cent0, 0);
+ cent1 = _mm256_inserti128_si256(cent1, cent0, 1);
+ __m256i cent = _mm256_unpacklo_epi64(cent1, cent1);
+ __m256i d1 = _mm256_sub_epi32(ind1, cent);
+ __m256i d2 = _mm256_sub_epi32(ind2, cent);
+ __m256i d3 = _mm256_mullo_epi32(d1, d1);
+ __m256i d4 = _mm256_mullo_epi32(d2, d2);
+ __m256i d5 = _mm256_hadd_epi32(d3, d4);
+ dist[j] = _mm256_permutevar8x32_epi32(d5, v_permute);
+ }
+
+ __m256i ind = _mm256_setzero_si256();
+ for (int j = 1; j < k; j++) {
+ __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
+ __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
+ __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
+ dist[0] = _mm256_or_si256(dist1, dist2);
+ ind1 = _mm256_set1_epi32(j);
+ ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+ _mm256_and_si256(cmp, ind1));
+ }
+
+ __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+ __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+ __m256i p2 = _mm256_packus_epi16(px, v_zero);
+ __m128i d1 = _mm256_extracti128_si256(p2, 0);
+
+ _mm_storel_epi64((__m128i *)indices, d1);
+
+ indices += 8;
+ data += 16;
+ }
+}
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index cda0c79..754a2da 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -32,14 +32,20 @@
typedef void (*av1_calc_indices_dim1_func)(const int *data,
const int *centroids,
uint8_t *indices, int n, int k);
+typedef void (*av1_calc_indices_dim2_func)(const int *data,
+ const int *centroids,
+ uint8_t *indices, int n, int k);
typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
av1_calc_indices_dim1Param;
-class AV1KmeansTest
+typedef std::tuple<av1_calc_indices_dim2_func, BLOCK_SIZE>
+ av1_calc_indices_dim2Param;
+
+class AV1KmeansTest1
: public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
public:
- ~AV1KmeansTest();
+ ~AV1KmeansTest1();
void SetUp();
void TearDown();
@@ -61,21 +67,18 @@
}
libaom_test::ACMRandom rnd_;
- int data_[5096];
+ int data_[4096];
int centroids_[8];
- uint8_t indices1_[5096];
- uint8_t indices2_[5096];
+ uint8_t indices1_[4096];
+ uint8_t indices2_[4096];
};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
-AV1KmeansTest::~AV1KmeansTest() { ; }
+AV1KmeansTest1::~AV1KmeansTest1() { ; }
-void AV1KmeansTest::SetUp() {
+void AV1KmeansTest1::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
- /*uint8_t indices1_[5096];
- uint8_t indices2_[5096];
- int data_[5096];*/
- for (int i = 0; i < 5096; ++i) {
+ for (int i = 0; i < 4096; ++i) {
data_[i] = (int)rnd_.Rand8() << 4;
}
for (int i = 0; i < 8; i++) {
@@ -83,21 +86,22 @@
}
}
-void AV1KmeansTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1KmeansTest1::TearDown() { libaom_test::ClearSystemState(); }
-void AV1KmeansTest::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
- BLOCK_SIZE bsize, int k) {
+void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
+ BLOCK_SIZE bsize, int k) {
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
const int n = w * h;
av1_calc_indices_dim1_c(data_, centroids_, indices1_, n, k);
test_impl(data_, centroids_, indices2_, n, k);
- ASSERT_EQ(CheckResult(n), true) << " block " << bsize << " Centroids " << n;
+ ASSERT_EQ(CheckResult(n), true)
+ << " block " << bsize << " index " << n << " Centroids " << k;
}
-void AV1KmeansTest::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
- BLOCK_SIZE bsize, int k) {
+void AV1KmeansTest1::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
+ BLOCK_SIZE bsize, int k) {
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
const int n = w * h;
@@ -121,7 +125,7 @@
printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
}
-TEST_P(AV1KmeansTest, CheckOutput) {
+TEST_P(AV1KmeansTest1, CheckOutput) {
// centroids = 2..8
RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
@@ -132,7 +136,115 @@
RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
}
-TEST_P(AV1KmeansTest, DISABLED_Speed) {
+TEST_P(AV1KmeansTest1, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 5);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 6);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 7);
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+class AV1KmeansTest2
+ : public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
+ public:
+ ~AV1KmeansTest2();
+ void SetUp();
+
+ void TearDown();
+
+ protected:
+ void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+ int centroids);
+ void RunSpeedTest(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+ int centroids);
+ bool CheckResult(int n) {
+ bool flag = true;
+ for (int idx = 0; idx < n; ++idx) {
+ if (indices1_[idx] != indices2_[idx]) {
+ printf("%d ", idx);
+ printf("%d != %d ", indices1_[idx], indices2_[idx]);
+ flag = false;
+ }
+ }
+ if (flag == false) {
+ return false;
+ }
+ return true;
+ }
+
+ libaom_test::ACMRandom rnd_;
+ int data_[4096 * 2];
+ int centroids_[8 * 2];
+ uint8_t indices1_[4096];
+ uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
+
+AV1KmeansTest2::~AV1KmeansTest2() { ; }
+
+void AV1KmeansTest2::SetUp() {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ for (int i = 0; i < 4096 * 2; ++i) {
+ data_[i] = (int)rnd_.Rand8();
+ }
+ for (int i = 0; i < 8 * 2; i++) {
+ centroids_[i] = (int)rnd_.Rand8();
+ }
+}
+
+void AV1KmeansTest2::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
+ BLOCK_SIZE bsize, int k) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int n = w * h;
+ av1_calc_indices_dim2_c(data_, centroids_, indices1_, n, k);
+ test_impl(data_, centroids_, indices2_, n, k);
+
+ ASSERT_EQ(CheckResult(n), true)
+ << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+void AV1KmeansTest2::RunSpeedTest(av1_calc_indices_dim2_func test_impl,
+ BLOCK_SIZE bsize, int k) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int n = w * h;
+ const int num_loops = 1000000000 / n;
+
+ av1_calc_indices_dim2_func funcs[2] = { av1_calc_indices_dim2_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ av1_calc_indices_dim2_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(data_, centroids_, indices1_, n, k);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("av1_calc_indices_dim2 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+ elapsed_time[0], elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1KmeansTest2, CheckOutput) {
+ // centroids = 2..8
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 4);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 5);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 6);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 7);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+TEST_P(AV1KmeansTest2, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
@@ -150,9 +262,13 @@
BLOCK_16X64, BLOCK_64X16 };
INSTANTIATE_TEST_SUITE_P(
- AVX2, AV1KmeansTest,
+ AVX2, AV1KmeansTest1,
::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1KmeansTest2,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
#endif
} // namespace AV1Kmeans