Increase tpl stats storage granularity for high resolution
Previously, tpl stats were always stored at 4x4 granularity.
This patch makes the granularity adapt to the source resolution.
At >=720p resolution, tpl stats are stored at 16x16 granularity;
otherwise, they are stored at 8x8 granularity. The granularity is
controlled by cpi->tpl_stats_block_mis_log2.
This reduces memory usage for 4k encoding by 0.9G.
Coding performance change:
lowres: -0.023%
midres: -0.002%
hdres: -0.008%
netflix4k_8bit: -0.001%
BUG=aomedia:2453
STATS_CHANGED
Change-Id: I4dbcd2c3cec4986fae279d5da2cbd5ede9d4c323
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index d21c126..047ba93 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -66,6 +66,7 @@
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/tpl_model.h"
static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
@@ -3443,9 +3444,8 @@
int tpl_stride = tpl_frame->stride;
int64_t intra_cost = 0;
int64_t mc_dep_cost = 0;
- int mi_wide = mi_size_wide[bsize];
- int mi_high = mi_size_high[bsize];
- int row, col;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
if (tpl_frame->is_valid == 0) return orig_rdmult;
@@ -3462,10 +3462,12 @@
const int mi_col_end_sr = av1_coded_to_superres_mi(
mi_col + mi_wide, cm->superres_scale_denominator);
const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
- for (row = mi_row; row < mi_row + mi_high; ++row) {
- for (col = mi_col_sr; col < mi_col_end_sr; ++col) {
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
+ for (int row = mi_row; row < mi_row + mi_high; row += step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
- TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
intra_cost += this_stats->intra_cost;
mc_dep_cost += this_stats->intra_cost + this_stats->mc_flow;
#if !USE_TPL_CLASSIC_MODEL
@@ -3526,9 +3528,8 @@
int tpl_stride = tpl_frame->stride;
int64_t intra_cost = 0;
int64_t mc_dep_cost = 0;
- int mi_wide = mi_size_wide[bsize];
- int mi_high = mi_size_high[bsize];
- int row, col;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
if (cpi->tpl_model_pass == 1) {
assert(cpi->oxcf.enable_tpl_model == 2);
@@ -3550,10 +3551,12 @@
const int mi_col_end_sr = av1_coded_to_superres_mi(
mi_col + mi_wide, cm->superres_scale_denominator);
const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
- for (row = mi_row; row < mi_row + mi_high; ++row) {
- for (col = mi_col_sr; col < mi_col_end_sr; ++col) {
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
+ for (int row = mi_row; row < mi_row + mi_high; row += step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
- TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
intra_cost += this_stats->intra_cost;
mc_dep_cost += this_stats->intra_cost + this_stats->mc_flow;
#if !USE_TPL_CLASSIC_MODEL
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f8ea834..1da9a6a 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2396,6 +2396,14 @@
aom_calloc(cm->mi_rows * cm->mi_cols, 1));
}
+static void set_tpl_stats_block_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+
+ // 0: 4x4, 1: 8x8, 2: 16x16
+ cpi->tpl_stats_block_mis_log2 = is_720p_or_larger ? 2 : 1;
+}
+
void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
CompoundTypeRdBuffers *const bufs) {
CHECK_MEM_ERROR(
@@ -2779,20 +2787,25 @@
sizeof(*cpi->ssim_rdmult_scaling_factors)));
}
+ set_tpl_stats_block_size(cpi);
for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
- int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
- int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ const int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+
+ cpi->tpl_stats_buffer[frame].is_valid = 0;
+ cpi->tpl_stats_buffer[frame].width =
+ mi_cols >> cpi->tpl_stats_block_mis_log2;
+ cpi->tpl_stats_buffer[frame].height =
+ mi_rows >> cpi->tpl_stats_block_mis_log2;
+ cpi->tpl_stats_buffer[frame].stride = cpi->tpl_stats_buffer[frame].width;
+ cpi->tpl_stats_buffer[frame].mi_rows = cm->mi_rows;
+ cpi->tpl_stats_buffer[frame].mi_cols = cm->mi_cols;
CHECK_MEM_ERROR(
cm, cpi->tpl_stats_buffer[frame].tpl_stats_ptr,
- aom_calloc(mi_rows * mi_cols,
+ aom_calloc(cpi->tpl_stats_buffer[frame].width *
+ cpi->tpl_stats_buffer[frame].height,
sizeof(*cpi->tpl_stats_buffer[frame].tpl_stats_ptr)));
- cpi->tpl_stats_buffer[frame].is_valid = 0;
- cpi->tpl_stats_buffer[frame].width = mi_cols;
- cpi->tpl_stats_buffer[frame].height = mi_rows;
- cpi->tpl_stats_buffer[frame].stride = mi_cols;
- cpi->tpl_stats_buffer[frame].mi_rows = cm->mi_rows;
- cpi->tpl_stats_buffer[frame].mi_cols = cm->mi_cols;
}
cpi->tpl_frame = &cpi->tpl_stats_buffer[REF_FRAMES + 1];
@@ -3639,12 +3652,13 @@
int64_t mc_saved_base = 0;
int64_t mc_count_base = 0;
#endif // !USE_TPL_CLASSIC_MODEL
- int row, col;
-
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
- for (row = 0; row < cm->mi_rows; ++row) {
- for (col = 0; col < mi_cols_sr; ++col) {
- TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+
+ for (int row = 0; row < cm->mi_rows; row += step) {
+ for (int col = 0; col < mi_cols_sr; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
intra_cost_base += this_stats->intra_cost;
mc_dep_cost_base += this_stats->intra_cost + this_stats->mc_flow;
#if !USE_TPL_CLASSIC_MODEL
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index da5e001..9a7e25a 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -786,6 +786,7 @@
YV12_BUFFER_CONFIG *unscaled_last_source;
YV12_BUFFER_CONFIG scaled_last_source;
+ uint8_t tpl_stats_block_mis_log2; // block granularity of tpl score storage
TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
TplDepFrame *tpl_frame;
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 53bcd17..c644c88 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -355,7 +355,13 @@
return iiratio * iiratio;
}
-static void tpl_model_update_b(TplDepFrame *tpl_frame,
+int av1_tpl_ptr_pos(AV1_COMP *cpi, int mi_row, int mi_col, int stride) {
+ const int right_shift = cpi->tpl_stats_block_mis_log2;
+
+ return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
+}
+
+static void tpl_model_update_b(AV1_COMP *cpi, TplDepFrame *tpl_frame,
TplDepStats *tpl_stats_ptr, int mi_row,
int mi_col, double quant_ratio,
const BLOCK_SIZE bsize, int ref_frame_index,
@@ -395,12 +401,11 @@
#if !USE_TPL_CLASSIC_MODEL
int64_t mc_saved = tpl_stats_ptr->intra_cost - tpl_stats_ptr->inter_cost;
#endif // #if !USE_TPL_CLASSIC_MODEL
- int idx, idy;
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *des_stats =
- &ref_stats_ptr[(ref_mi_row + idy) * ref_tpl_frame->stride +
- (ref_mi_col + idx)];
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
+ for (int idy = 0; idy < mi_height; idy += step) {
+ for (int idx = 0; idx < mi_width; idx += step) {
+ TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+ cpi, ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride)];
des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
#if !USE_TPL_CLASSIC_MODEL
des_stats->mc_count += overlap_area << TPL_DEP_COST_SCALE_LOG2;
@@ -413,30 +418,32 @@
}
}
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats_ptr,
- int mi_row, int mi_col, double quant_ratio,
- const BLOCK_SIZE bsize, int ref_frame_index,
- int_mv mv) {
- int idx, idy;
+static void tpl_model_update(AV1_COMP *cpi, TplDepFrame *tpl_frame,
+ TplDepStats *tpl_stats_ptr, int mi_row, int mi_col,
+ double quant_ratio, const BLOCK_SIZE bsize,
+ int ref_frame_index, int_mv mv) {
const int mi_height = mi_size_high[bsize];
const int mi_width = mi_size_wide[bsize];
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
+ const BLOCK_SIZE tpl_block_size =
+ convert_length_to_bsize(MI_SIZE << cpi->tpl_stats_block_mis_log2);
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *tpl_ptr =
- &tpl_stats_ptr[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
- tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
- quant_ratio, BLOCK_4X4, ref_frame_index, mv);
+ for (int idy = 0; idy < mi_height; idy += step) {
+ for (int idx = 0; idx < mi_width; idx += step) {
+ TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos(
+ cpi, mi_row + idy, mi_col + idx, tpl_frame->stride)];
+ tpl_model_update_b(cpi, tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+ quant_ratio, tpl_block_size, ref_frame_index, mv);
}
}
}
-static void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int stride,
- const TplDepStats *src_stats) {
+static void tpl_model_store(AV1_COMP *cpi, TplDepStats *tpl_stats_ptr,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int stride, const TplDepStats *src_stats) {
const int mi_height = mi_size_high[bsize];
const int mi_width = mi_size_wide[bsize];
- int idx, idy;
+ const int step = 1 << cpi->tpl_stats_block_mis_log2;
int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
@@ -446,9 +453,10 @@
intra_cost = AOMMAX(1, intra_cost);
inter_cost = AOMMAX(1, inter_cost);
- for (idy = 0; idy < mi_height; ++idy) {
- tpl_ptr = &tpl_stats_ptr[(mi_row + idy) * stride + mi_col];
- for (idx = 0; idx < mi_width; ++idx) {
+ for (int idy = 0; idy < mi_height; idy += step) {
+ tpl_ptr =
+ &tpl_stats_ptr[av1_tpl_ptr_pos(cpi, mi_row + idy, mi_col, stride)];
+ for (int idx = 0; idx < mi_width; idx += step) {
tpl_ptr->intra_cost = intra_cost;
tpl_ptr->inter_cost = inter_cost;
tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
@@ -565,12 +573,13 @@
&ref_frame_index, &mv);
// Motion flow dependency dispenser.
- tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+ tpl_model_store(cpi, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
tpl_frame->stride, &tpl_stats);
double quant_ratio = (double)recon_error / sse;
- if (frame_idx)
- tpl_model_update(cpi->tpl_frame, tpl_frame->tpl_stats_ptr, mi_row,
+ if (frame_idx) {
+ tpl_model_update(cpi, cpi->tpl_frame, tpl_frame->tpl_stats_ptr, mi_row,
mi_col, quant_ratio, bsize, ref_frame_index, mv);
+ }
}
}
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index d089b3f..e3b1e68 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -22,6 +22,8 @@
void av1_tpl_setup_forward_stats(AV1_COMP *cpi);
+int av1_tpl_ptr_pos(AV1_COMP *cpi, int mi_row, int mi_col, int stride);
+
#ifdef __cplusplus
} // extern "C"
#endif