Refactor and speed up av1_optimize_txb()
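Add update_coeff_eob_facade() and update_coeff_simple_facade(), which
hold the per-coefficient loops that were previously expanded inline by
the UPDATE_COEFF_EOB_CASE and UPDATE_COEFF_SIMPLE_CASE macros.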
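Refactor update_coeff_eob() and update_coeff_simple() to eliminate
redundant computations: the get_dqv() call is moved off the function
entry so that it is skipped on paths that never use dqv (after the
early return in update_coeff_simple(), and when qc == 0 in
update_coeff_eob()).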
Encoder performance results averaged over all resolutions:
  Encoder CPU | Instruction Count Reduction (%)
  ------------+--------------------------------
       1      |              1.17
       2      |              1.03
       3      |              0.85
       4      |              0.50
       5      |              0.57
       6      |              0.46
This change is bit-exact for all presets.
Change-Id: Ib00ae2d15bbd06ac1d65b921171a4ab7b5035744
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index 6f0fdff..cf76ffd 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -80,7 +80,6 @@
const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
uint8_t *levels, int sharpness, const qm_val_t *iqmatrix,
const qm_val_t *qmatrix) {
- const int dqv = get_dqv(dequant, scan[si], iqmatrix);
(void)eob;
// this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
// and not the last (scan_idx != eob - 1)
@@ -104,6 +103,7 @@
return;
}
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
const int64_t rd = RDCOST(rdmult, rate, dist);
@@ -135,7 +135,6 @@
const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
- const int dqv = get_dqv(dequant, scan[si], iqmatrix);
assert(si != *eob - 1);
const int ci = scan[si];
const tran_low_t qc = qcoeff[ci];
@@ -144,6 +143,7 @@
if (qc == 0) {
*accu_rate += txb_costs->base_cost[coeff_ctx][0];
} else {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
int lower_level = 0;
const tran_low_t abs_qc = abs(qc);
const tran_low_t tqc = tcoeff[ci];
@@ -299,6 +299,36 @@
return 0;
}
+static AOM_FORCE_INLINE void update_coeff_eob_facade(
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int *si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix, int max_nz_num) {
+ for (; *si >= 0 && *nz_num <= max_nz_num; --*si) {
+ update_coeff_eob(accu_rate, accu_dist, eob, nz_num, nz_ci, *si, tx_size,
+ tx_class, bhl, width, dc_sign_ctx, rdmult, shift, dequant,
+ scan, txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, sharpness, iqmatrix, qmatrix);
+ }
+}
+
+static AOM_FORCE_INLINE void update_coeff_simple_facade(
+ int *accu_rate, int *si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ int bhl, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+ const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ uint8_t *levels, int sharpness, const qm_val_t *iqmatrix,
+ const qm_val_t *qmatrix) {
+ for (; *si >= 1; --*si) {
+ update_coeff_simple(accu_rate, *si, eob, tx_size, tx_class, bhl, rdmult,
+ shift, dequant, scan, txb_costs, tcoeff, qcoeff,
+ dqcoeff, levels, sharpness, iqmatrix, qmatrix);
+ }
+}
+
int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, TX_SIZE tx_size, TX_TYPE tx_type,
const TXB_CTX *const txb_ctx, int *rate_cost,
@@ -400,13 +430,11 @@
#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
case tx_class_literal: \
- for (; si >= 0 && nz_num <= max_nz_num; --si) { \
- update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
- tx_size, tx_class_literal, bhl, width, \
- txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
- txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
- levels, sharpness, iqmatrix, qmatrix); \
- } \
+ update_coeff_eob_facade( \
+ &accu_rate, &accu_dist, &eob, &nz_num, nz_ci, &si, tx_size, \
+ tx_class_literal, bhl, width, txb_ctx->dc_sign_ctx, rdmult, shift, \
+ dequant, scan, txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness, iqmatrix, qmatrix, max_nz_num); \
break
switch (tx_class) {
UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
@@ -421,14 +449,12 @@
non_skip_cost, qcoeff, dqcoeff);
}
-#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
- case tx_class_literal: \
- for (; si >= 1; --si) { \
- update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
- rdmult, shift, dequant, scan, txb_costs, tcoeff, \
- qcoeff, dqcoeff, levels, sharpness, iqmatrix, \
- qmatrix); \
- } \
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ update_coeff_simple_facade(&accu_rate, &si, eob, tx_size, \
+ tx_class_literal, bhl, rdmult, shift, dequant, \
+ scan, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness, iqmatrix, qmatrix); \
break
switch (tx_class) {
UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);