AV1 RT: Move VBP data allocation to the thread level
Move the VBP data structure allocation from a per-superblock basis to a per-thread basis.
This gives a ~3-4% speedup for low-resolution speed 8 encoding on low-end ARM
devices, where memory allocation and de-allocation are expensive. No impact on x86.
Change-Id: Iabce1a4976cf47dcb6c65819bf71f843bc19a104
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 3e5f966..a3f3a81 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4429,7 +4429,7 @@
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
} else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
- av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
}
assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
cpi->partition_search_skippable_frame ||
@@ -4706,7 +4706,7 @@
if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
- av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
&dummy_rate, &dummy_dist, 1, pc_root);
}
@@ -4967,6 +4967,7 @@
CHECK_MEM_ERROR(
cm, cpi->tile_data,
aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+
cpi->allocated_tiles = tile_cols * tile_rows;
}
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 89c8302..dd2ebab 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -866,6 +866,11 @@
aom_free(cpi->td.mb.mbmi_ext);
cpi->td.mb.mbmi_ext = NULL;
+ if (cpi->td.vt64x64) {
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+ }
+
av1_free_ref_frame_buffers(cm->buffer_pool);
av1_free_txb_buf(cpi);
av1_free_context_buffers(cm);
@@ -3604,6 +3609,7 @@
aom_free(thread_data->td->above_pred_buf);
aom_free(thread_data->td->left_pred_buf);
aom_free(thread_data->td->wsrc_buf);
+ aom_free(thread_data->td->vt64x64);
aom_free(thread_data->td->inter_modes_info);
for (int x = 0; x < 2; x++) {
@@ -5396,6 +5402,21 @@
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
q_low = bottom_index;
q_high = top_index;
+ if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ const int num_64x64_blocks =
+ (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+ if (cpi->td.vt64x64) {
+ if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+ }
+ }
+ if (!cpi->td.vt64x64) {
+ CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+ aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+ cpi->td.num_64x64_blocks = num_64x64_blocks;
+ }
+ }
if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats &&
cm->current_frame.frame_type == KEY_FRAME) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 63c3b38..aa40799 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -575,6 +575,55 @@
int num_threads_working;
} AV1RowMTInfo;
+typedef struct {
+ // TODO(kyslov): consider changing to 64bit
+
+ // This struct is used for computing variance in choose_partitioning(), where
+ // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+ // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+ // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;
+ int variance;
+} VPartVar;
+
+typedef struct {
+ VPartVar none;
+ VPartVar horz[2];
+ VPartVar vert[2];
+} VPVariance;
+
+typedef struct {
+ VPVariance part_variances;
+ VPartVar split[4];
+} VP4x4;
+
+typedef struct {
+ VPVariance part_variances;
+ VP4x4 split[4];
+} VP8x8;
+
+typedef struct {
+ VPVariance part_variances;
+ VP8x8 split[4];
+} VP16x16;
+
+typedef struct {
+ VPVariance part_variances;
+ VP16x16 split[4];
+} VP32x32;
+
+typedef struct {
+ VPVariance part_variances;
+ VP32x32 split[4];
+} VP64x64;
+
+typedef struct {
+ VPVariance part_variances;
+ VP64x64 *split;
+} VP128x128;
+
// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
typedef struct TileDataEnc {
TileInfo tile_info;
@@ -634,6 +683,8 @@
int deltaq_used;
FRAME_CONTEXT *tctx;
MB_MODE_INFO_EXT *mbmi_ext;
+ VP64x64 *vt64x64;
+ int32_t num_64x64_blocks;
} ThreadData;
struct EncWorkerData;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 3682b68..82220ff 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -483,6 +483,14 @@
cm, thread_data->td->mbmi_ext,
aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext)));
+ if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ const int num_64x64_blocks =
+ (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->vt64x64,
+ aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
+ }
+
// Create threads
if (!winterface->reset(worker))
aom_internal_error(&cm->error, AOM_CODEC_ERROR,
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index d9e4d1a..b0fb2f0 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -34,57 +34,8 @@
extern const uint8_t AV1_VAR_OFFS[];
typedef struct {
- // TODO(kyslov): consider changing to 64bit
-
- // This struct is used for computing variance in choose_partitioning(), where
- // the max number of samples within a superblock is 32x32 (with 4x4 avg).
- // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
- // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
- uint32_t sum_square_error;
- int32_t sum_error;
- int log2_count;
- int variance;
-} var;
-
-typedef struct {
- var none;
- var horz[2];
- var vert[2];
-} partition_variance;
-
-typedef struct {
- partition_variance part_variances;
- var split[4];
-} v4x4;
-
-typedef struct {
- partition_variance part_variances;
- v4x4 split[4];
-} v8x8;
-
-typedef struct {
- partition_variance part_variances;
- v8x8 split[4];
-} v16x16;
-
-typedef struct {
- partition_variance part_variances;
- v16x16 split[4];
-} v32x32;
-
-typedef struct {
- partition_variance part_variances;
- v32x32 split[4];
-} v64x64;
-
-typedef struct {
- partition_variance part_variances;
- v64x64 *split;
-} v128x128;
-
-typedef struct {
- partition_variance *part_variances;
- var *split[4];
+ VPVariance *part_variances;
+ VPartVar *split[4];
} variance_node;
static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
@@ -93,42 +44,42 @@
node->part_variances = NULL;
switch (bsize) {
case BLOCK_128X128: {
- v128x128 *vt = (v128x128 *)data;
+ VP128x128 *vt = (VP128x128 *)data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i].part_variances.none;
break;
}
case BLOCK_64X64: {
- v64x64 *vt = (v64x64 *)data;
+ VP64x64 *vt = (VP64x64 *)data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i].part_variances.none;
break;
}
case BLOCK_32X32: {
- v32x32 *vt = (v32x32 *)data;
+ VP32x32 *vt = (VP32x32 *)data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i].part_variances.none;
break;
}
case BLOCK_16X16: {
- v16x16 *vt = (v16x16 *)data;
+ VP16x16 *vt = (VP16x16 *)data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i].part_variances.none;
break;
}
case BLOCK_8X8: {
- v8x8 *vt = (v8x8 *)data;
+ VP8x8 *vt = (VP8x8 *)data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i].part_variances.none;
break;
}
default: {
- v4x4 *vt = (v4x4 *)data;
+ VP4x4 *vt = (VP4x4 *)data;
assert(bsize == BLOCK_4X4);
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
@@ -138,13 +89,14 @@
}
// Set variance values given sum square error, sum error, count.
-static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c,
+ VPartVar *v) {
v->sum_square_error = s2;
v->sum_error = s;
v->log2_count = c;
}
-static AOM_INLINE void get_variance(var *v) {
+static AOM_INLINE void get_variance(VPartVar *v) {
v->variance =
(int)(256 * (v->sum_square_error -
(uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
@@ -152,7 +104,8 @@
v->log2_count);
}
-static AOM_INLINE void sum_2_variances(const var *a, const var *b, var *r) {
+static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b,
+ VPartVar *r) {
assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
a->sum_error + b->sum_error, a->log2_count + 1, r);
@@ -263,7 +216,7 @@
static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp,
const uint8_t *d, int dp,
int x16_idx, int y16_idx,
- v16x16 *vst,
+ VP16x16 *vst,
#if CONFIG_AV1_HIGHBITDEPTH
int highbd_flag,
#endif
@@ -335,7 +288,7 @@
static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp,
const uint8_t *d, int dp,
- int x8_idx, int y8_idx, v8x8 *vst,
+ int x8_idx, int y8_idx, VP8x8 *vst,
#if CONFIG_AV1_HIGHBITDEPTH
int highbd_flag,
#endif
@@ -454,8 +407,8 @@
// Set temporal variance low flag for superblock 64x64.
// Only first 25 in the array are used in this case.
static AOM_INLINE void set_low_temp_var_flag_64x64(
- CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd, v64x64 *vt,
- const int64_t thresholds[], int mi_col, int mi_row) {
+ CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
+ VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
if (xd->mi[0]->sb_type == BLOCK_64X64) {
if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
x->variance_low[0] = 1;
@@ -505,7 +458,7 @@
static AOM_INLINE void set_low_temp_var_flag_128x128(
CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
- v128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
+ VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
if (xd->mi[0]->sb_type == BLOCK_128X128) {
if (vt->part_variances.none.variance < (thresholds[0] >> 1))
x->variance_low[0] = 1;
@@ -582,7 +535,7 @@
}
static AOM_INLINE void set_low_temp_var_flag(
- AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v128x128 *vt,
+ AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt,
int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col,
int mi_row) {
AV1_COMMON *const cm = &cpi->common;
@@ -672,13 +625,14 @@
// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
// selection and most of all - retune the thresholds
int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
- MACROBLOCK *x, int mi_row, int mi_col) {
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
int i, j, k, m;
- v128x128 *vt;
- v16x16 *vt2 = NULL;
+ VP128x128 *vt;
+ VP16x16 *vt2 = NULL;
unsigned char force_split[85];
int avg_32x32;
int max_var_32x32[4];
@@ -708,14 +662,13 @@
unsigned int y_sad = UINT_MAX;
unsigned int y_sad_g = UINT_MAX;
BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
- v64x64 *vt64x64 = NULL;
// Ref frame used in partitioning.
MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
- CHECK_MEM_ERROR(cm, vt64x64, aom_malloc(sizeof(*vt64x64) * num_64x64_blocks));
CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
- vt->split = vt64x64;
+
+ vt->split = td->vt64x64;
int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
cpi->vbp_thresholds[2], cpi->vbp_thresholds[3],
@@ -866,7 +819,7 @@
const int x16_idx = x32_idx + ((j & 1) << 4);
const int y16_idx = y32_idx + ((j >> 1) << 4);
const int split_index = 21 + i2 + j;
- v16x16 *vst = &vt->split[m].split[i].split[j];
+ VP16x16 *vst = &vt->split[m].split[i].split[j];
force_split[split_index] = 0;
variance4x4downsample[i2 + j] = 0;
if (!is_key_frame) {
@@ -926,7 +879,7 @@
for (k = 0; k < 4; k++) {
int x8_idx = x16_idx + ((k & 1) << 3);
int y8_idx = y16_idx + ((k >> 1) << 3);
- v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+ VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
#if CONFIG_AV1_HIGHBITDEPTH
xd->cur_buf->flags,
@@ -947,7 +900,7 @@
for (j = 0; j < 4; j++) {
const int split_index = 21 + i2 + j;
if (variance4x4downsample[i2 + j] == 1) {
- v16x16 *vtemp =
+ VP16x16 *vtemp =
(!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j];
for (k = 0; k < 4; k++)
fill_variance_tree(&vtemp->split[k], BLOCK_8X8);
@@ -1050,7 +1003,7 @@
// For inter frames: if variance4x4downsample[] == 1 for this
// 16x16 block, then the variance is based on 4x4 down-sampling,
// so use vt2 in set_vt_partioning(), otherwise use vt.
- v16x16 *vtemp =
+ VP16x16 *vtemp =
(!is_key_frame && variance4x4downsample[i2 + j] == 1)
? &vt2[i2 + j]
: &vt->split[m].split[i].split[j];
@@ -1082,7 +1035,6 @@
}
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
- if (vt64x64) aom_free(vt64x64);
if (vt2) aom_free(vt2);
if (vt) aom_free(vt);
return 0;
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
index a2b6532..a80e25c 100644
--- a/av1/encoder/var_based_part.h
+++ b/av1/encoder/var_based_part.h
@@ -35,7 +35,8 @@
int content_state);
int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
- MACROBLOCK *x, int mi_row, int mi_col);
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col);
#ifdef __cplusplus
} // extern "C"