Add SIMD support for ext-quant experiment

Change-Id: I2c46a098af9f70bd2668e07acc28c0bac01007ab
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index d3e2c6b..ee385b7 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h
@@ -57,6 +57,7 @@ #if CONFIG_EXTQUANT #define QUANT_TABLE_BITS 3 +#define QUANT_FP_BITS 4 #else #define QUANT_TABLE_BITS 0 #endif // CONFIG_EXTQUANT
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 6e102e3..9ad5d4b 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -551,11 +551,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_EXTQUANT") eq "yes") { add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const 
int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; + + add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64 sse2/; + + add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t 
*quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } else { add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b sse2 avx2/;
diff --git a/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/aom_dsp/x86/highbd_quantize_intrin_avx2.c index 469149c..c816b5f 100644 --- a/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -15,11 +15,21 @@ #include "aom/aom_integer.h" +#if CONFIG_EXTQUANT +static INLINE void init_one_qp(const int32_t *p, __m256i *qp) { +#else static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { +#endif +#if CONFIG_EXTQUANT + const int p1 = p[1]; + const int p0 = p[0]; + *qp = _mm256_set_epi32(p1, p1, p1, p1, p1, p1, p1, p0); +#else const __m128i sign = _mm_srai_epi16(*p, 15); const __m128i dc = _mm_unpacklo_epi16(*p, sign); const __m128i ac = _mm_unpackhi_epi16(*p, sign); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +#endif } static INLINE void update_qp(__m256i *qp) { @@ -38,6 +48,13 @@ const int16_t *quant_ptr, const int16_t *dequant_ptr, const int16_t *quant_shift_ptr, __m256i *qp) { #endif +#if CONFIG_EXTQUANT + init_one_qp(zbin_ptr, &qp[0]); + init_one_qp(round_ptr, &qp[1]); + init_one_qp(quant_ptr, &qp[2]); + init_one_qp(dequant_ptr, &qp[3]); + init_one_qp(quant_shift_ptr, &qp[4]); +#else const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); @@ -48,6 +65,7 @@ init_one_qp(&quant, &qp[2]); init_one_qp(&dequant, &qp[3]); init_one_qp(&quant_shift, &qp[4]); +#endif } // Note: @@ -77,7 +95,9 @@ __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); flag2 = _mm256_or_si256(flag1, flag2); const int32_t nzflag = _mm256_movemask_epi8(flag2); - +#if CONFIG_EXTQUANT + const __m256i offset = _mm256_set1_epi32((1 << QUANT_TABLE_BITS) >> 1); +#endif if (LIKELY(nzflag)) { __m256i q = _mm256_add_epi32(abs, qp[1]); __m256i tmp; @@ -86,6 +106,10 @@ mm256_mul_shift_epi32(&q, &qp[4], &q); __m256i dq = _mm256_mullo_epi32(q, qp[3]); +#if CONFIG_EXTQUANT + dq = _mm256_add_epi32(dq, offset); + dq = _mm256_srli_epi32(dq, QUANT_TABLE_BITS); +#endif q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c);
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index b2fe741..4f48663 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,6 +15,16 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
+#if CONFIG_EXTQUANT
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+                                const int32_t *zbin_ptr,
+                                const int32_t *round_ptr,
+                                const int32_t *quant_ptr,
+                                const int32_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int32_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+#else
 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                 const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
@@ -23,6 +33,7 @@
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const int16_t *scan, const int16_t *iscan) {
+#endif
   int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
   __m128i zbins[2];
   __m128i nzbins[2];
@@ -82,8 +93,11 @@
                 (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
             qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
 #if CONFIG_EXTQUANT
+            // Widen to 64 bits before multiplying so the product cannot wrap.
+            const tran_low_t abs_dqcoeff = (tran_low_t)ROUND_POWER_OF_TWO_64(
+                (int64_t)abs_qcoeff * dequant_ptr[k != 0], QUANT_TABLE_BITS);
             dqcoeff_ptr[k] =
-                qcoeff_ptr[k] * dequant_ptr[k != 0] / (1 << QUANT_TABLE_BITS);
+                (tran_low_t)((abs_dqcoeff ^ coeff_sign[j]) - coeff_sign[j]);
 #else
             dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
 #endif
@@ -94,12 +108,21 @@
   *eob_ptr = eob_i + 1;
 }
 
+#if CONFIG_EXTQUANT
+void aom_highbd_quantize_b_32x32_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
+    const int32_t *round_ptr, const int32_t *quant_ptr,
+    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+#else
 void aom_highbd_quantize_b_32x32_sse2(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
+#endif  // CONFIG_EXTQUANT
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
@@ -143,12 +166,20 @@
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
     const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+#if CONFIG_EXTQUANT
+    const int32_t abs_qcoeff =
+#else
     const uint32_t abs_qcoeff =
+#endif
         (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
     qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
 #if CONFIG_EXTQUANT
-    dqcoeff_ptr[rc] =
-        qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (2 << QUANT_TABLE_BITS);
+    // Widen to 64 bits before multiplying so the product cannot overflow.
+    const tran_low_t abs_dqcoeff =
+        (tran_low_t)ROUND_POWER_OF_TWO_64(
+            (int64_t)abs_qcoeff * dequant_ptr[rc != 0], QUANT_TABLE_BITS) >>
+        1;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
 #else
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
 #endif
@@ -157,12 +188,21 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_EXTQUANT
+void aom_highbd_quantize_b_64x64_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
+    const int32_t *round_ptr, const int32_t *quant_ptr,
+    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+#else
 void aom_highbd_quantize_b_64x64_sse2(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
+#endif  // CONFIG_EXTQUANT
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
@@ -206,12 +246,20 @@
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
     const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+#if CONFIG_EXTQUANT
+    const int32_t abs_qcoeff =
+#else
     const uint32_t abs_qcoeff =
+#endif
         (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
     qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
 #if CONFIG_EXTQUANT
-    dqcoeff_ptr[rc] =
-        qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (4 << QUANT_TABLE_BITS);
+    // Widen to 64 bits before multiplying so the product cannot overflow.
+    const tran_low_t abs_dqcoeff =
+        (tran_low_t)ROUND_POWER_OF_TWO_64(
+            (int64_t)abs_qcoeff * dequant_ptr[rc != 0], QUANT_TABLE_BITS) >>
+        2;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
 #else
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
 #endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index b61b431..b3d533f 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -371,6 +371,7 @@ if (aom_config("CONFIG_EXTQUANT") eq "yes") { add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; + specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; } else { add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h index 1d1d0c0..314d4ae 100644 --- a/av1/encoder/av1_quantize.h +++ b/av1/encoder/av1_quantize.h
@@ -43,7 +43,6 @@ const QUANT_PARAM *qparam); #if CONFIG_EXTQUANT -#define QUANT_FP_BITS 4 static const int qindex_10b_offset[] = { 0, 48,
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c index d8809f1..285a106 100644 --- a/av1/encoder/encodemb.c +++ b/av1/encoder/encodemb.c
@@ -297,8 +297,13 @@ if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); if (LIKELY(!x->seg_skip_block)) { +#if CONFIG_EXTQUANT + quant_func_list[qparam->xform_quant_idx][1]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#else quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd]( coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#endif } else { av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); }
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c
index cc8d6e2..bed144e 100644
--- a/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -38,6 +38,22 @@
                            const int16_t *dequant_ptr, int log_scale,
                            __m256i *qp) {
 #endif
+#if CONFIG_EXTQUANT
+  const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+  const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+  qp[0] = _mm256_set_epi32(round1, round1, round1, round1, round1, round1,
+                           round1, round0);
+
+  const int quant1 = quant_ptr[1];
+  const int quant0 = quant_ptr[0];
+  qp[1] = _mm256_set_epi32(quant1, quant1, quant1, quant1, quant1, quant1,
+                           quant1, quant0);
+
+  const int dequant1 = dequant_ptr[1];
+  const int dequant0 = dequant_ptr[0];
+  qp[2] = _mm256_set_epi32(dequant1, dequant1, dequant1, dequant1, dequant1,
+                           dequant1, dequant1, dequant0);
+#else
   __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
   if (log_scale) {
     const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
@@ -49,6 +65,7 @@
   init_one_qp(&round, &qp[0]);
   init_one_qp(&quant, &qp[1]);
   init_one_qp(&dequant, &qp[2]);
+#endif
 }
 
 static INLINE void quantize(const __m256i *qp, __m256i *c,
@@ -56,14 +73,29 @@
                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
                             __m256i *eob) {
   const __m256i abs_coeff = _mm256_abs_epi32(*c);
+#if CONFIG_EXTQUANT
+  // Rounding offset replicated into all eight 32-bit lanes, because dq is
+  // built with _mm256_mullo_epi32 and so carries one product per lane.
+  const __m256i round = _mm256_set1_epi32((1 << QUANT_TABLE_BITS) >> 1);
+#endif
   __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
 
   __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
   __m256i q_hi = _mm256_srli_epi64(q, 32);
   const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
   q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+#if CONFIG_EXTQUANT
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale + QUANT_FP_BITS);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale + QUANT_FP_BITS);
+#else
   q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
   q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+#endif
+
+#if CONFIG_EXTQUANT
+  log_scale += QUANT_TABLE_BITS;
+#endif
+
   q_hi = _mm256_slli_epi64(q_hi, 32);
   q = _mm256_or_si256(q_lo, q_hi);
   const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
@@ -71,6 +103,10 @@
   q = _mm256_andnot_si256(mask, q);
 
   __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+#if CONFIG_EXTQUANT
+  // Add the offset per 32-bit lane before the combined right shift.
+  dq = _mm256_add_epi32(dq, round);
+#endif
   dq = _mm256_srai_epi32(dq, log_scale);
   q = _mm256_sign_epi32(q, *c);
   dq = _mm256_sign_epi32(dq, *c);
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c index c9c2df1..bd9af6d 100644 --- a/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -25,6 +25,9 @@ __m128i *sign) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi32(1); +#if CONFIG_EXTQUANT + const __m128i round = _mm_set1_epi64x((1 << QUANT_TABLE_BITS) >> 1); +#endif *sign = _mm_cmplt_epi32(*coeff, zero); *sign = _mm_or_si128(*sign, one); @@ -37,6 +40,9 @@ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); +#if CONFIG_EXTQUANT + dquan[0] = _mm_add_epi64(dquan[0], round); +#endif dquan[0] = _mm_srli_epi64(dquan[0], scale); const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); @@ -50,10 +56,16 @@ tran_low_t *dqAddr) { __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); +#if CONFIG_EXTQUANT + const __m128i round = _mm_set1_epi64x((1 << QUANT_TABLE_BITS) >> 1); +#endif qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); +#if CONFIG_EXTQUANT + dquan[1] = _mm_add_epi64(dquan[1], round); +#endif dquan[1] = _mm_srli_epi64(dquan[1], scale); // combine L&H @@ -132,7 +144,11 @@ const tran_low_t *src = coeff_ptr; tran_low_t *quanAddr = qcoeff_ptr; tran_low_t *dquanAddr = dqcoeff_ptr; +#if CONFIG_EXTQUANT + const int shift = 16 - log_scale + QUANT_FP_BITS; +#else const int shift = 16 - log_scale; +#endif const int coeff_stride = 4; const int quan_stride = coeff_stride; (void)zbin_ptr; @@ -151,6 +167,9 @@ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], dequant_ptr[0]); +#if CONFIG_EXTQUANT + log_scale += QUANT_TABLE_BITS; +#endif // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,