AVX2: Remove some permutes in av1_quantize_lp_avx2 for TX_4X4
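
The first 16 coefficients (DC plus the first 15 AC) are now handled by a
dedicated quantize_lp_16_first() that reads and writes at offset 0. The
permutes that overwrite the DC constants with the AC constants are only
executed when n_coeffs > 16, so they are skipped for TX_4X4.

Performance: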
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|------------|----------|----------|---------|-------|
| 5 | rtc | +0.000% | +0.000% | +0.000% | +0.0% |
| 5 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 5 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.0% |
| 5 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
| 6 | rtc | +0.000% | +0.000% | +0.000% | +0.0% |
| 6 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 6 | rtc_derf | +0.000% | +0.000% | +0.000% | +0.0% |
| 6 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
| 7 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 7 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
| 7 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
| 8 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 8 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
| 8 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
| 9 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 9 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
| 9 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
| 10 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 10 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.2% |
| 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
| 10 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.0% |
Change-Id: I5e838985c5ec03a604cd4609ab9d938f2d866cf7
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 8c4e395..75c5172 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -98,6 +98,28 @@
return _mm_extract_epi16(eob, 1);
}
+static AOM_FORCE_INLINE void quantize_lp_16_first(  // Quantize 16 int16 coeffs.
+ const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+ __m256i *dequant256, __m256i *eob) {  // *eob: running per-lane maximum.
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);  // saturating
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);  // high 16 bits
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);  // reapply sign
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);  // low 16 bits
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());  // all-ones if > 0
+ // Unaligned stores of the quantized and dequantized lanes, at offset 0.
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff);
+ // Per lane: (iscan + 1) where the quantized value is nonzero, otherwise 0.
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);  // - (-1) == + 1
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
static AOM_FORCE_INLINE void quantize_lp_16(
const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
@@ -143,29 +165,21 @@
quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
- coeff_ptr += n_coeffs;
- iscan += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
-
// Process DC and the first 15 AC coeffs.
- quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
- &quant256, &dequant256, &eob256);
+ quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
- // Overwrite the DC constants with AC constants
- dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
- quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
- round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+ if (n_coeffs > 16) {
+ // Overwrite the DC constants with AC constants
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
- n_coeffs += 8 * 2;
-
- // AC only loop.
- while (n_coeffs < 0) {
- quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr,
- &round256, &quant256, &dequant256, &eob256);
-
- n_coeffs += 8 * 2;
+ // AC only loop.
+ for (int idx = 16; idx < n_coeffs; idx += 16) {
+ quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+ }
}
*eob_ptr = accumulate_eob256(eob256);