Fix incorrect results in the SSE2 k-means index calculation
Speed Test:
av1_calc_indices_dim1 : 3.75x
av1_calc_indices_dim2 : 4.89x
BUG = aomedia:2942
Change-Id: I64cd7042754a9a0812f774906e7c02ac26ca2992
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c
index 10efc9c..43f661f 100644
--- a/av1/encoder/x86/av1_k_means_sse2.c
+++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -27,7 +27,10 @@
for (int j = 0; j < k; j++) {
__m128i cent = _mm_set1_epi32((uint32_t)centroids[j]);
__m128i d1 = _mm_sub_epi32(ind[l], cent);
- dist[j] = _mm_madd_epi16(d1, d1);
+ __m128i d2 = _mm_packs_epi32(d1, d1);
+ __m128i d3 = _mm_mullo_epi16(d2, d2);
+ __m128i d4 = _mm_mulhi_epi16(d2, d2);
+ dist[j] = _mm_unpacklo_epi16(d3, d4);
}
ind[l] = _mm_setzero_si128();
@@ -39,8 +42,8 @@
__m128i ind1 = _mm_set1_epi32(j);
ind[l] =
_mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
- ind[l] = _mm_packus_epi16(ind[l], v_zero);
}
+ ind[l] = _mm_packus_epi16(ind[l], v_zero);
if (l == 1) {
__m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
_mm_storel_epi64((__m128i *)indices, p2);
@@ -84,8 +87,8 @@
ind1 = _mm_set1_epi32(j);
ind[l] =
_mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
- ind[l] = _mm_packus_epi16(ind[l], v_zero);
}
+ ind[l] = _mm_packus_epi16(ind[l], v_zero);
if (l == 1) {
__m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
_mm_storel_epi64((__m128i *)indices, p2);
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index 32cf644..c158766 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -254,7 +254,7 @@
RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
}
-#if HAVE_SSE2 || HAVE_AVX2
+#if HAVE_AVX2 || HAVE_SSE2
const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8, BLOCK_8X16, BLOCK_8X32,
BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
@@ -277,11 +277,11 @@
INSTANTIATE_TEST_SUITE_P(
SSE2, AV1KmeansTest1,
- ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_sse2),
::testing::ValuesIn(kValidBlockSize)));
INSTANTIATE_TEST_SUITE_P(
SSE2, AV1KmeansTest2,
- ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
::testing::ValuesIn(kValidBlockSize)));
#endif