CONFIG_EXTQUANT: high-bitdepth quantizer SIMD updates and unit tests

Change-Id: I6e35237c3045d0b421300ae6f62c3fcc9d1f7e70
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 9ad5d4b..a1be01d 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -551,7 +551,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_EXTQUANT") eq "yes") { add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b sse2 avx2/; + specialize qw/aom_highbd_quantize_b sse2/; add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 4f48663..13d6530 100644 --- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -89,7 +89,11 @@ int k = 4 * i + j; const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; +#if CONFIG_EXTQUANT + const int32_t abs_qcoeff = +#else const uint32_t abs_qcoeff = +#endif (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; #if CONFIG_EXTQUANT
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c index bed144e..b63a9a6 100644 --- a/av1/encoder/x86/av1_highbd_quantize_avx2.c +++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -74,7 +74,7 @@ __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); #if CONFIG_EXTQUANT - const __m256i round = _mm256_set1_epi64x((1 << QUANT_TABLE_BITS) >> 1); + const __m256i round = _mm256_set1_epi32((1 << QUANT_TABLE_BITS) >> 1); #endif __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc index 5714ba4..d319f1e 100644 --- a/test/av1_quantize_test.cc +++ b/test/av1_quantize_test.cc
@@ -21,13 +21,21 @@ #include "av1/common/scan.h" namespace { - +#if CONFIG_EXTQUANT +typedef void (*QuantizeFpFunc)( + const tran_low_t *coeff_ptr, intptr_t count, const int32_t *zbin_ptr, + const int32_t *round_ptr, const int32_t *quant_ptr, + const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale); +#else typedef void (*QuantizeFpFunc)( const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale); +#endif struct QuantizeFuncParams { QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL, @@ -42,14 +50,31 @@ const int numTests = 1000; const int maxSize = 1024; +#if CONFIG_EXTQUANT +const int roundFactorRange = 64; +const int dequantRange = 1048576; +#else const int roundFactorRange = 127; const int dequantRange = 32768; +#endif const int coeffRange = (1 << 20) - 1; class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> { public: void RunQuantizeTest() { ACMRandom rnd(ACMRandom::DeterministicSeed()); +#if CONFIG_EXTQUANT + DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]); + DECLARE_ALIGNED(16, int32_t, zbin_ptr[8]); + DECLARE_ALIGNED(16, int32_t, round_ptr[8]); + DECLARE_ALIGNED(16, int32_t, quant_ptr[8]); + DECLARE_ALIGNED(16, int32_t, quant_shift_ptr[8]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, int32_t, dequant_ptr[8]); +#else DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]); DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]); 
DECLARE_ALIGNED(16, int16_t, round_ptr[8]); @@ -60,6 +85,7 @@ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]); DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]); DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]); +#endif uint16_t eob; uint16_t ref_eob; int err_count_total = 0; @@ -79,12 +105,23 @@ } for (int j = 0; j < 2; j++) { +#if CONFIG_EXTQUANT + zbin_ptr[j] = rnd.Rand31(); + quant_shift_ptr[j] = rnd.Rand31(); + // int32_t positive + dequant_ptr[j] = abs(rnd(dequantRange)); + quant_ptr[j] = static_cast<int32_t>( + (1 << (16 + QUANT_FP_BITS + QUANT_TABLE_BITS)) / dequant_ptr[j]); + round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> + (7 + QUANT_TABLE_BITS); +#else zbin_ptr[j] = rnd.Rand16(); quant_shift_ptr[j] = rnd.Rand16(); // int16_t positive dequant_ptr[j] = abs(rnd(dequantRange)); quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]); round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7; +#endif } for (int j = 2; j < 8; ++j) { zbin_ptr[j] = zbin_ptr[1]; @@ -125,6 +162,18 @@ void RunEobTest() { ACMRandom rnd(ACMRandom::DeterministicSeed()); +#if CONFIG_EXTQUANT + DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]); + DECLARE_ALIGNED(16, int32_t, zbin_ptr[8]); + DECLARE_ALIGNED(16, int32_t, round_ptr[8]); + DECLARE_ALIGNED(16, int32_t, quant_ptr[8]); + DECLARE_ALIGNED(16, int32_t, quant_shift_ptr[8]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]); + DECLARE_ALIGNED(16, int32_t, dequant_ptr[8]); +#else DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]); DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]); DECLARE_ALIGNED(16, int16_t, round_ptr[8]); @@ -135,6 +184,7 @@ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]); DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]); DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]); +#endif 
uint16_t eob; uint16_t ref_eob; int count = params_.coeffCount; @@ -155,12 +205,23 @@ coeff_ptr[rnd(count)] = rnd(coeffRange); for (int j = 0; j < 2; j++) { +#if CONFIG_EXTQUANT + zbin_ptr[j] = rnd.Rand31(); + quant_shift_ptr[j] = rnd.Rand31(); + // int32_t positive + dequant_ptr[j] = abs(rnd(dequantRange)); + quant_ptr[j] = + ((1 << (16 + QUANT_FP_BITS + QUANT_TABLE_BITS)) / dequant_ptr[j]); + round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> + (7 + QUANT_TABLE_BITS); +#else zbin_ptr[j] = rnd.Rand16(); quant_shift_ptr[j] = rnd.Rand16(); // int16_t positive dequant_ptr[j] = abs(rnd(dequantRange)); quant_ptr[j] = (1 << 16) / dequant_ptr[j]; round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7; +#endif } for (int j = 2; j < 8; ++j) { zbin_ptr[j] = zbin_ptr[1]; @@ -207,7 +268,7 @@ TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); } TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); } -#if HAVE_SSE4_1 && !CONFIG_EXTQUANT +#if HAVE_SSE4_1 && CONFIG_EXTQUANT const QuantizeFuncParams qfps[4] = { QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c, 16), @@ -222,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps)); #endif // HAVE_SSE4_1 -#if HAVE_AVX2 && !CONFIG_EXTQUANT +#if HAVE_AVX2 && CONFIG_EXTQUANT const QuantizeFuncParams qfps_avx2[4] = { QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c, 16),
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index c1e5539..ba2698b 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc
@@ -439,6 +439,38 @@ INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest, ::testing::ValuesIn(kQParamArrayAvx2)); +#elif HAVE_AVX2 && CONFIG_EXTQUANT +const QuantizeParam kQParamArrayAvx2[] = { + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12), +}; +INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest, + ::testing::ValuesIn(kQParamArrayAvx2)); #endif // HAVE_AVX2 #if HAVE_SSE2 && !CONFIG_EXTQUANT @@ -528,9 +560,33 @@ 
INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest, ::testing::ValuesIn(kQParamArraySSE2)); +#elif HAVE_SSE2 && CONFIG_EXTQUANT +const QuantizeParam kQParamArraySSE2[] = { + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest, + ::testing::ValuesIn(kQParamArraySSE2)); #endif -#if HAVE_NEON +#if HAVE_NEON && !CONFIG_EXTQUANT const QuantizeParam kQParamArrayNEON[] = { make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),