Fixed a computation bug in fdct16_sse2() fdct16_sse2() was not bit-exact with C reference, fdct16(). The inconsistency was found by writing a unit test for vp10_fht16x16_sse2(). Since the unit test needs a pending change on the inherited base class. I will commit this unit test after making a header file for this base class. Passed the uncommitted unit test: vp10_fht16x16_test.cc. Change-Id: If2b617883c633a3ea90c19e1d018240c8007102b
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c index 79d1e88..aaf1e6a 100644 --- a/vp10/encoder/x86/dct_sse2.c +++ b/vp10/encoder/x86/dct_sse2.c
@@ -1635,7 +1635,7 @@ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -1839,10 +1839,10 @@ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); - v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); - v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); - v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); + v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); + v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); + v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); @@ -1872,10 +1872,10 @@ // stage 5 s[0] = _mm_add_epi16(p[0], t[1]); s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_add_epi16(p[3], t[2]); - s[3] = _mm_sub_epi16(p[3], t[2]); - s[4] = _mm_sub_epi16(p[4], t[5]); - s[5] = _mm_add_epi16(p[4], t[5]); + s[2] = _mm_sub_epi16(p[3], t[2]); + s[3] = _mm_add_epi16(p[3], t[2]); + s[4] = _mm_add_epi16(p[4], t[5]); + s[5] = _mm_sub_epi16(p[4], t[5]); s[6] = _mm_sub_epi16(p[7], t[6]); s[7] = _mm_add_epi16(p[7], t[6]);