Add SSE2 optimization for av1_quantize_lp

Added an SSE2 optimization for av1_quantize_lp.

Speed test results (SSE2 vs. C) at function level:
  TX_16X16: 10.42X
  TX_8X8: 8.78X
  TX_4X4: 5.75X

Change-Id: Ib3d6adac01dfa9bf854eb244c3d429bb021da9ea
(cherry picked from commit 982274586148709b81b42f29b9b0636f8d1cc052)
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 6ea67e1..24d9f31 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -331,7 +331,7 @@ specialize qw/av1_quantize_fp sse2 avx2 neon/; add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/av1_quantize_lp avx2 neon/; + specialize qw/av1_quantize_lp sse2 avx2 neon/; add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp_32x32 neon avx2/;
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c index 5497c7e..b533894 100644 --- a/av1/encoder/x86/av1_quantize_sse2.c +++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -15,6 +15,7 @@ #include "config/av1_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, __m128i *c0, __m128i *c1) { @@ -187,3 +188,102 @@ *eob_ptr = _mm_extract_epi16(eob, 1); } } + +static INLINE void quantize_lp(const int16_t *iscan_ptr, + const int16_t *coeff_ptr, intptr_t n_coeffs, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + __m128i *eob) { + const int16_t *read = coeff_ptr + n_coeffs; + __m128i coeff0 = _mm_load_si128((const __m128i *)read); + __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + int16_t *addr = qcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, qcoeff0); + _mm_store_si128((__m128i *)addr + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + addr = dqcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, coeff0); + _mm_store_si128((__m128i *)addr + 1, coeff1); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 
= _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); +} + +void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + __m128i eob = _mm_setzero_si128(); + + // DC and first 15 AC + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &eob); + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, 
&dequant1, &dequant1, &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + *eob_ptr = accumulate_eob(eob); +}
diff --git a/test/quantize_lp_func_test.cc b/test/quantize_lp_func_test.cc index 850ec94..f398623 100644 --- a/test/quantize_lp_func_test.cc +++ b/test/quantize_lp_func_test.cc
@@ -334,6 +334,20 @@ ::testing::ValuesIn(kQParamArrayAVX2)); #endif +#if HAVE_SSE2 +const QuantizeParam kQParamArraySSE2[] = { + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2, + static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8), + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2, + static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8), + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2, + static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeLpTest, + ::testing::ValuesIn(kQParamArraySSE2)); +#endif + #if HAVE_NEON const QuantizeParam kQParamArrayNEON[] = { make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_neon,