Speed up av1_optimize_b
1. Move the call to get_txb_ctx out of av1_optimize_b
into its caller, outside of the RDO loops.
2. Remove the call to av1_get_tx_type; tx_type is now
passed in by its caller.
3. The encoder is about 1.3% faster when encoding
20 frames of BasketballDrill_832x480_50.y4m, with no
coding loss (601278 ms -> 592634 ms).
a) gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
b) CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=20
Change-Id: I755b337e29316f4ceed37c9b669aebb4ad2d5fac
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 69c8eda..cea8db6 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -97,25 +97,23 @@
}
int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
- int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
- const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost) {
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int fast_mode,
+ int *rate_cost) {
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblock_plane *const p = &mb->plane[plane];
const int eob = p->eobs[block];
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
const int segment_id = xd->mi[0]->segment_id;
if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
xd->lossless[segment_id]) {
- *rate_cost = av1_cost_skip_txb(mb, &txb_ctx, plane, tx_size);
+ *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
return eob;
}
(void)fast_mode;
- return av1_optimize_txb_new(cpi, mb, plane, blk_row, blk_col, block, tx_size,
- &txb_ctx, rate_cost, cpi->oxcf.sharpness);
+ return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
+ rate_cost, cpi->oxcf.sharpness);
}
typedef enum QUANT_FUNC {
@@ -234,8 +232,10 @@
if (args->enable_optimize_b) {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, tx_type, AV1_XFORM_QUANT_FP);
- av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
- tx_size, a, l, 1, &dummy_rate_cost);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
} else {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
@@ -526,8 +526,10 @@
if (args->enable_optimize_b) {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, tx_type, AV1_XFORM_QUANT_FP);
- av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
- tx_size, a, l, 1, &dummy_rate_cost);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
} else {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 1be2ce0..673f87e 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -15,6 +15,7 @@
#include "config/aom_config.h"
#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
#include "av1/encoder/block.h"
#include "av1/encoder/tokenize.h"
#ifdef __cplusplus
@@ -53,9 +54,8 @@
AV1_XFORM_QUANT xform_quant_idx);
int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
- int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
- const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost);
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
int blk_col, int blk_row, TX_SIZE tx_size);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 1aadeb1..4d4802b 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -280,7 +280,7 @@
const int is_eob, const TxbInfo *const txb_info,
const LV_MAP_COEFF_COST *const txb_costs,
const int coeff_ctx, const TX_CLASS tx_class) {
- const TXB_CTX *txb_ctx = txb_info->txb_ctx;
+ const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
const int is_nz = (qc != 0);
const tran_low_t abs_qc = abs(qc);
int cost = 0;
@@ -1552,14 +1552,13 @@
}
int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
- int blk_row, int blk_col, int block, TX_SIZE tx_size,
- TXB_CTX *txb_ctx, int *rate_cost, int sharpness) {
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
const PLANE_TYPE plane_type = get_plane_type(plane);
const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
- const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
- tx_size, cm->reduced_tx_set_used);
const TX_CLASS tx_class = tx_type_to_class[tx_type];
const MB_MODE_INFO *mbmi = xd->mi[0];
const struct macroblock_plane *p = &x->plane[plane];
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index 62a6017..aa847ad 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -78,8 +78,8 @@
void hbt_destroy();
int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
- int blk_row, int blk_col, int block, TX_SIZE tx_size,
- TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
#ifdef __cplusplus
}
#endif
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8045e3c..6f4fced 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2516,8 +2516,7 @@
static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- const ENTROPY_CONTEXT *a,
- const ENTROPY_CONTEXT *l,
+ const TXB_CTX *const txb_ctx,
FAST_TX_SEARCH_MODE ftxs_mode,
int use_fast_coef_costing, int64_t ref_best_rd,
RD_STATS *best_rd_stats) {
@@ -2558,9 +2557,7 @@
find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
- cur_joint_ctx = (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
+ cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
if (intra_hash_idx > 0 &&
intra_txb_rd_info->entropy_context == cur_joint_ctx &&
x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
@@ -2679,8 +2676,6 @@
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_sse *= 16;
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
if (!allowed_tx_mask[tx_type]) continue;
if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
@@ -2692,7 +2687,7 @@
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
- tx_size, &txb_ctx, use_fast_coef_costing);
+ tx_size, txb_ctx, use_fast_coef_costing);
} else {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, tx_type, AV1_XFORM_QUANT_FP);
@@ -2702,15 +2697,15 @@
dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
&this_rd_stats.sse);
rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
- tx_size, &txb_ctx, use_fast_coef_costing);
+ tx_size, txb_ctx, use_fast_coef_costing);
const int64_t rd_estimate =
AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
RDCOST(x->rdmult, 0, this_rd_stats.sse));
if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd))
continue;
}
- av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
- tx_size, a, l, 1, &rate_cost);
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
+ &rate_cost);
}
if (eobs_ptr[block] == 0) {
// When eob is 0, pixel domain distortion is more efficient and accurate.
@@ -2809,8 +2804,8 @@
} else {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
- av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
- tx_size, a, l, 1, &rate_cost);
+ av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
+ &rate_cost);
}
}
@@ -2868,9 +2863,10 @@
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
}
-
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- a, l, args->ftxs_mode, args->use_fast_coef_costing,
+ &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
args->best_rd - args->this_rd, &this_rd_stats);
if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
@@ -4191,7 +4187,7 @@
RD_STATS this_rd_stats;
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- a, l, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+ &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
av1_merge_rd_stats(rd_stats, &this_rd_stats);
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 5de6a53..1fa3d68 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -68,8 +68,8 @@
int bsh, int visible_w, int visible_h, int qindex);
#endif
-static INLINE int av1_cost_skip_txb(MACROBLOCK *x, TXB_CTX *txb_ctx, int plane,
- TX_SIZE tx_size) {
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+ int plane, TX_SIZE tx_size) {
const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
const PLANE_TYPE plane_type = get_plane_type(plane);
const LV_MAP_COEFF_COST *const coeff_costs =
@@ -79,7 +79,8 @@
static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
int plane, int blk_row, int blk_col,
- int block, TX_SIZE tx_size, TXB_CTX *txb_ctx,
+ int block, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx,
int use_fast_coef_costing) {
#if TXCOEFF_COST_TIMER
struct aom_usec_timer timer;