Fix correctness errors in SSE2 k_means

Speed Test:
av1_calc_indices_dim1 : 3.75x
av1_calc_indices_dim2 : 4.89x

BUG=aomedia:2942

Change-Id: I64cd7042754a9a0812f774906e7c02ac26ca2992
diff --git a/av1/encoder/x86/av1_k_means_sse2.c b/av1/encoder/x86/av1_k_means_sse2.c
index 10efc9c..43f661f 100644
--- a/av1/encoder/x86/av1_k_means_sse2.c
+++ b/av1/encoder/x86/av1_k_means_sse2.c
@@ -27,7 +27,10 @@
     for (int j = 0; j < k; j++) {
       __m128i cent = _mm_set1_epi32((uint32_t)centroids[j]);
       __m128i d1 = _mm_sub_epi32(ind[l], cent);
-      dist[j] = _mm_madd_epi16(d1, d1);
+      __m128i d2 = _mm_packs_epi32(d1, d1);
+      __m128i d3 = _mm_mullo_epi16(d2, d2);
+      __m128i d4 = _mm_mulhi_epi16(d2, d2);
+      dist[j] = _mm_unpacklo_epi16(d3, d4);
     }
 
     ind[l] = _mm_setzero_si128();
@@ -39,8 +42,8 @@
       __m128i ind1 = _mm_set1_epi32(j);
       ind[l] =
           _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
-      ind[l] = _mm_packus_epi16(ind[l], v_zero);
     }
+    ind[l] = _mm_packus_epi16(ind[l], v_zero);
     if (l == 1) {
       __m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
       _mm_storel_epi64((__m128i *)indices, p2);
@@ -84,8 +87,8 @@
       ind1 = _mm_set1_epi32(j);
       ind[l] =
           _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
-      ind[l] = _mm_packus_epi16(ind[l], v_zero);
     }
+    ind[l] = _mm_packus_epi16(ind[l], v_zero);
     if (l == 1) {
       __m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
       _mm_storel_epi64((__m128i *)indices, p2);
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index 32cf644..c158766 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -254,7 +254,7 @@
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
 }
 
-#if HAVE_SSE2 || HAVE_AVX2
+#if HAVE_AVX2 || HAVE_SSE2
 const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,
                                        BLOCK_16X8,  BLOCK_16X16, BLOCK_16X32,
                                        BLOCK_32X8,  BLOCK_32X16, BLOCK_32X32,
@@ -277,11 +277,11 @@
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1KmeansTest1,
-    ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_sse2),
                        ::testing::ValuesIn(kValidBlockSize)));
 INSTANTIATE_TEST_SUITE_P(
     SSE2, AV1KmeansTest2,
-    ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
                        ::testing::ValuesIn(kValidBlockSize)));
 #endif