Fix bug in av1_highbd_quantize_fp_avx2/sse4
1. Both av1_highbd_quantize_fp_avx2/sse4 were missing the
comparison of abs_coeff<<(1+log_scale) against dequant.
When abs_coeff<<(1+log_scale) is less than dequant,
the output should be zero.
2. Fix the AV1QuantizeTest test case: the input data
arrays need to be extended to 8 elements for SIMD.
3. Re-enable the sse4_1 versions of AV1QuantizeTest.BitExactCheck
and AV1QuantizeTest.EobVerify,
and add avx2 versions of these tests.
4. Re-enable the SIMD optimization. The encoder is about 4.2%
faster when encoding 10 frames of city_cif with
CONFIG_LOWBITDEPTH set to 0.
a) gcc (Ubuntu 7.2.0-8ubuntu3.2) 7.2.0
b) CPU: Intel(R) Core(TM) i7-6900K CPU @ 3.20GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=0
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=10
Change-Id: I05c2203ccdf078efa3a0cf227c03797859c355e3
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 119b4f4..757c9f0 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -405,13 +405,10 @@
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale);
return;
}
-
- // TODO(yunqing): modify the AVX2 version to match the c version, and
- // then turn on it and also enable its unit test.
- av1_highbd_quantize_fp_c(
- coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_fp_QTX,
- p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale);
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX,
+ qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qparam->log_scale);
}
}
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c
index 52d3437..20d4160 100644
--- a/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -59,6 +59,9 @@
q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
q_hi = _mm256_slli_epi64(q_hi, 32);
q = _mm256_or_si256(q_lo, q_hi);
+ const __m256i abs_s = _mm256_slli_epi32(abs, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
__m256i dq = _mm256_mullo_epi32(q, qp[2]);
dq = _mm256_srai_epi32(dq, log_scale);
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c
index 3ca7581..da8bb37 100644
--- a/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -36,6 +36,8 @@
qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
}
// Coefficient quantization phase 2
@@ -70,7 +72,8 @@
qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
dquan[0] = _mm_sign_epi32(dquan[0], *sign);
-
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
_mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
_mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
}
@@ -113,7 +116,7 @@
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, int log_scale) {
- __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
__m128i eob = _mm_setzero_si128();
const tran_low_t *src = coeff_ptr;
tran_low_t *quanAddr = qcoeff_ptr;
@@ -137,6 +140,8 @@
qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
// DC and first 3 AC
quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
@@ -146,7 +151,7 @@
qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
-
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
log_scale, quanAddr, dquanAddr);
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 86a48f2..f594a64 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -50,15 +50,15 @@
void RunQuantizeTest() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
uint16_t eob;
uint16_t ref_eob;
int err_count_total = 0;
@@ -86,7 +86,13 @@
quant_ptr[j] = (1 << 16) / dequant_ptr[j];
round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
}
-
+ for (int j = 2; j < 8; ++j) {
+ zbin_ptr[j] = zbin_ptr[1];
+ quant_shift_ptr[j] = quant_shift_ptr[1];
+ dequant_ptr[j] = dequant_ptr[1];
+ quant_ptr[j] = quant_ptr[1];
+ round_ptr[j] = round_ptr[1];
+ }
quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
&ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
@@ -99,7 +105,7 @@
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+ ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
<< "qcoeff error: i = " << i << " j = " << j << "\n";
EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
<< "dqcoeff error: i = " << i << " j = " << j << "\n";
@@ -120,15 +126,15 @@
void RunEobTest() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
uint16_t eob;
uint16_t ref_eob;
int skip_block = 0;
@@ -157,6 +163,13 @@
quant_ptr[j] = (1 << 16) / dequant_ptr[j];
round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
}
+ for (int j = 2; j < 8; ++j) {
+ zbin_ptr[j] = zbin_ptr[1];
+ quant_shift_ptr[j] = quant_shift_ptr[1];
+ dequant_ptr[j] = dequant_ptr[1];
+ quant_ptr[j] = quant_ptr[1];
+ round_ptr[j] = round_ptr[1];
+ }
quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
@@ -191,8 +204,8 @@
QuantizeFuncParams params_;
};
-TEST_P(AV1QuantizeTest, DISABLED_BitExactCheck) { RunQuantizeTest(); }
-TEST_P(AV1QuantizeTest, DISABLED_EobVerify) { RunEobTest(); }
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
#if HAVE_SSE4_1
const QuantizeFuncParams qfps[4] = {
@@ -208,4 +221,20 @@
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 1024),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif // HAVE_AVX2
+
} // namespace