Reuse inter prediction result in real-time speed 6
In real-time speed 6, no partition search is done. The inter
prediction results got from picking mode can be reused in the
following encoding process. A speed feature reuse_inter_pred_sby
is added to only enable the resue in speed 6.
This patch doesn't change encoding result. RTC set tests showed
that the encoding speed gain is 2% - 5%.
Change-Id: I3884780f64ef95dd8be10562926542528713b92c
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 001ac69..e90b8dd 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3364,7 +3364,10 @@
vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
&xd->block_refs[ref]->sf);
}
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+ if (!cpi->sf.reuse_inter_pred_sby)
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+
+ vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
if (!x->skip) {
mbmi->skip = 1;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f3b2d2f..b621da3 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -23,6 +23,7 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rdopt.h"
@@ -183,6 +184,22 @@
*out_dist_sum += dist << 4;
}
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (!p[i].in_use) {
+ p[i].in_use = 1;
+ return i;
+ }
+ }
+ return -1;
+}
+
+static void free_pred_buffer(PRED_BUFFER *p) {
+ p->in_use = 0;
+}
+
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// this needs various further optimizations. to be continued..
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -229,6 +246,31 @@
const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
get_chessboard_index(cm)) % 2;
+ // For speed 6, the result of interp filter is reused later in actual encoding
+ // process.
+ int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+ int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ int pixels_in_block = bh * bw;
+ // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+ PRED_BUFFER tmp[4];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+ struct buf_2d orig_dst = pd->dst;
+ PRED_BUFFER *best_pred = NULL;
+ PRED_BUFFER *this_mode_pred = NULL;
+ int i;
+
+ if (cpi->sf.reuse_inter_pred_sby) {
+ for (i = 0; i < 3; i++) {
+ tmp[i].data = &pred_buf[pixels_in_block * i];
+ tmp[i].stride = bw;
+ tmp[i].in_use = 0;
+ }
+
+ tmp[3].data = pd->dst.buf;
+ tmp[3].stride = pd->dst.stride;
+ tmp[3].in_use = 0;
+ }
+
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
x->skip = 0;
@@ -324,6 +366,16 @@
// Search for the best prediction filter type, when the resulting
// motion vector is at sub-pixel accuracy level for luma component, i.e.,
// the last three bits are all zeros.
+ if (cpi->sf.reuse_inter_pred_sby) {
+ if (this_mode == NEARESTMV) {
+ this_mode_pred = &tmp[3];
+ } else {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
pred_filter_search &&
((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
@@ -334,6 +386,7 @@
unsigned int pf_sse[3];
int64_t best_cost = INT64_MAX;
INTERP_FILTER best_filter = SWITCHABLE, filter;
+ PRED_BUFFER *current_pred = this_mode_pred;
for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
int64_t cost;
@@ -345,12 +398,28 @@
vp9_get_switchable_rate(cpi) + pf_rate[filter],
pf_dist[filter]);
if (cost < best_cost) {
- best_filter = filter;
- best_cost = cost;
- skip_txfm = x->skip_txfm;
+ best_filter = filter;
+ best_cost = cost;
+ skip_txfm = x->skip_txfm;
+
+ if (cpi->sf.reuse_inter_pred_sby) {
+ if (this_mode_pred != current_pred) {
+ free_pred_buffer(this_mode_pred);
+ this_mode_pred = current_pred;
+ }
+
+ if (filter < EIGHTTAP_SHARP) {
+ current_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
}
}
+ if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
mbmi->interp_filter = best_filter;
rate = pf_rate[mbmi->interp_filter];
dist = pf_dist[mbmi->interp_filter];
@@ -451,6 +520,16 @@
best_pred_filter = mbmi->interp_filter;
best_ref_frame = ref_frame;
skip_txfm = x->skip_txfm;
+
+ if (cpi->sf.reuse_inter_pred_sby) {
+ if (best_pred != NULL)
+ free_pred_buffer(best_pred);
+
+ best_pred = this_mode_pred;
+ }
+ } else {
+ if (cpi->sf.reuse_inter_pred_sby)
+ free_pred_buffer(this_mode_pred);
}
if (x->skip)
@@ -458,6 +537,19 @@
}
}
+ // If best prediction is not in dst buf, then copy the prediction block from
+ // temp buf to dst buf.
+ if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
+ uint8_t *copy_from, *copy_to;
+
+ pd->dst = orig_dst;
+ copy_to = pd->dst.buf;
+
+ copy_from = best_pred->data;
+
+ vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
+ bw, bh);
+ }
mbmi->mode = best_mode;
mbmi->interp_filter = best_pred_filter;
@@ -471,12 +563,21 @@
if (!x->skip && best_rd > inter_mode_thresh &&
bsize <= cpi->sf.max_intra_bsize) {
for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+ if (cpi->sf.reuse_inter_pred_sby) {
+ pd->dst.buf = tmp[0].data;
+ pd->dst.stride = bw;
+ }
+
vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
mbmi->tx_size, this_mode,
&p->src.buf[0], p->src.stride,
&pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ if (cpi->sf.reuse_inter_pred_sby)
+ pd->dst = orig_dst;
+
rate += cpi->mbmode_cost[this_mode];
rate += intra_cost_penalty;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -494,6 +595,7 @@
}
}
}
+
#if CONFIG_DENOISING
vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
#endif
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index a9c948d..3d89974 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -17,6 +17,12 @@
extern "C" {
#endif
+typedef struct {
+ uint8_t *data;
+ int stride;
+ int in_use;
+} PRED_BUFFER;
+
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const struct TileInfo *const tile,
int mi_row, int mi_col,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 83d900d..c38323f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -274,6 +274,9 @@
// is checked for a partition block. Later, we can try to allow large
// partitions to do intra mode checking.
sf->max_intra_bsize = BLOCK_8X8;
+
+ // This feature is only enabled when partition search is disabled.
+ sf->reuse_inter_pred_sby = 1;
}
if (speed >= 7) {
@@ -339,6 +342,7 @@
for (i = 0; i < BLOCK_SIZES; ++i)
sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_64X64;
+ sf->reuse_inter_pred_sby = 0;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
sf->always_this_block_size = BLOCK_16X16;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c796421..8750eaf 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -353,6 +353,11 @@
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
unsigned int source_var_thresh;
+
+ // When partition is pre-set, the inter prediction result from pick_inter_mode
+ // can be reused in final block encoding process. It is enabled only for real-
+ // time mode speed 6.
+ int reuse_inter_pred_sby;
} SPEED_FEATURES;
struct VP9_COMP;