Implement av1_quantize_fp_64x64_avx2
1. Implement av1_quantize_fp_64x64_avx2
2. The perf result shows the CPU time of
quantize_fp_helper_c drop from 1.01% to
0.65%(+0.11% for av1_quantize_fp_64x64_avx2)
3. Refactor: extract quant_gather_eob
a) gcc (Ubuntu 7.2.0-8ubuntu3.2) 7.2.0
b) CPU: Intel(R) Core(TM) i7-6900K CPU @ 3.20GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=10
Change-Id: Ic1d07bc2b995a71876cf61d9bcc72895592ccb59
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index df09142..4a37d23 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -173,7 +173,7 @@
specialize qw/av1_quantize_fp_32x32 avx2/;
add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
+ specialize qw/av1_quantize_fp_64x64 avx2/;
# fdct functions
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 078a675..fb18829 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -57,7 +57,7 @@
init_one_qp(&round, &qp[0]);
init_one_qp(&quant, &qp[1]);
- if (log_scale > 0) {
+ if (log_scale == 1) {
qp[1] = _mm256_slli_epi16(qp[1], log_scale);
}
@@ -94,6 +94,15 @@
} \
} while (0)
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
const int16_t *iscan_ptr, tran_low_t *qcoeff,
tran_low_t *dqcoeff, __m256i *eob) {
@@ -163,18 +172,7 @@
iscan_ptr += step;
n_coeffs -= step;
}
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
+ *eob_ptr = quant_gather_eob(eob);
} else {
do {
write_zero(qcoeff_ptr);
@@ -261,18 +259,98 @@
iscan_ptr += step;
n_coeffs -= step;
}
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ *eob_ptr = quant_gather_eob(eob);
+ } else {
+ do {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ n_coeffs -= step;
+ } while (n_coeffs > 0);
+ *eob_ptr = 0;
+ }
+}
+
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs, qp[0]);
+ __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+ __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+ qh = _mm256_slli_epi16(qh, 2);
+ ql = _mm256_srli_epi16(ql, 14);
+ q = _mm256_or_si256(qh, ql);
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+ __m256i dq = _mm256_or_si256(dqh, dql);
+
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ if (LIKELY(!skip_block)) {
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 2;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
}
+ *eob_ptr = quant_gather_eob(eob);
} else {
do {
write_zero(qcoeff_ptr);
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 9cfc52e..71c8d94 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -301,6 +301,8 @@
AOM_BITS_8),
make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
+ TYPE_FP, AOM_BITS_8),
make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
TYPE_FP, AOM_BITS_8),