Fix daala-dist for cb4x4
The place where av1_daala_dist() is applied for sub8x8 partition is
moved from sub8x8 mode decision functions to rd_pick_partition().
The BD-Rate change from enabling daala-dist with '--disable-var-tx' is:
(AWCY, objective-1-fast, high delay mode)
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
15.1558 | 12.9585 | 14.4662 | -3.8651 | -1.7102 | -9.2956 | 10.8686
In MSE probe mode:
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
0.0429 | 0.0435 | 0.1651 | -0.0415 | 0.0850 | 0.0122 | 0.0546
Change-Id: I3b2ea916d41c48e433eb641adf44552e4725c198
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 602cef4..5dcd499 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -179,15 +179,15 @@
#endif
#if CONFIG_DAALA_DIST
-void av1_foreach_8x8_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+void av1_foreach_8x8_transformed_block_in_yplane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit,
foreach_transformed_block_visitor mi_visit, void *arg) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
- const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const TX_SIZE tx_size = get_tx_size(0, xd);
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const uint8_t txw_unit = tx_size_wide_unit[tx_size];
const uint8_t txh_unit = tx_size_high_unit[tx_size];
@@ -197,18 +197,24 @@
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
- const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
- const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int skip_check_r = tx_size_high[tx_size] == 8 ? 1 : 0;
+ const int skip_check_c = tx_size_wide[tx_size] == 8 ? 1 : 0;
+
+ assert(plane_bsize >= BLOCK_8X8);
+ assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
// Keep track of the row and column of the blocks we use so that we know
// if we are in the unrestricted motion border.
for (r = 0; r < max_blocks_high; r += txh_unit) {
// Skip visiting the sub blocks that are wholly within the UMV.
for (c = 0; c < max_blocks_wide; c += txw_unit) {
- visit(plane, i, r, c, plane_bsize, tx_size, arg);
- // Call whenever each 8x8 block is done
- if ((r & 1) && (c & 1))
- mi_visit(plane, i, r - 1, c - 1, plane_bsize, TX_8X8, arg);
+ visit(0, i, r, c, plane_bsize, tx_size, arg);
+ // Call whenever each 8x8 tx block is done
+ if (((r & txh_unit) || skip_check_r) && ((c & txw_unit) || skip_check_c))
+ mi_visit(0, i, r - (1 - skip_check_r) * txh_unit,
+ c - (1 - skip_check_c) * txw_unit, plane_bsize, tx_size, arg);
i += step;
}
}
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 0022907..f679a60 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -310,6 +310,9 @@
int64_t rdcost;
int64_t sse;
int skip; // sse should equal to dist when skip == 1
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ int64_t dist_y;
+#endif
#if CONFIG_RD_DEBUG
int txb_coeff_cost[MAX_MB_PLANE];
#if CONFIG_VAR_TX
@@ -1164,8 +1167,8 @@
#endif
#if CONFIG_DAALA_DIST
-void av1_foreach_8x8_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+void av1_foreach_8x8_transformed_block_in_yplane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit,
foreach_transformed_block_visitor mi_visit, void *arg);
#endif
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 12dbef9..47bf4f7 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -209,8 +209,11 @@
// This is needed when using the 8x8 Daala distortion metric during RDO,
// because it evaluates distortion in a different order than the underlying
// 4x4 blocks are coded.
- int rate_4x4[256];
-#endif
+ int rate_4x4[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#if CONFIG_CB4X4
+ DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]);
+#endif // CONFIG_CB4X4
+#endif // CONFIG_DAALA_DIST
#if CONFIG_CFL
// Whether luma needs to be stored during RDO.
int cfl_store_y;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 0465ba8..0c46d03 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1281,6 +1281,29 @@
return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
}
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
+ BLOCK_SIZE bsize, int bw, int bh,
+ int mi_row, int mi_col) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = pd->dst.buf;
+
+ assert(bsize < BLOCK_8X8);
+
+ if (bsize < BLOCK_8X8) {
+ int i, j;
+ uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
+
+ for (j = 0; j < bh; ++j)
+ for (i = 0; i < bw; ++i) {
+ dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i];
+ }
+ }
+}
+#endif
+
static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCK *const x, int mi_row, int mi_col,
RD_STATS *rd_cost,
@@ -3576,9 +3599,29 @@
#if CONFIG_SUPERTX
sum_rate_nocoef += this_rate_nocoef;
#endif // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ sum_rdc.dist_y += this_rdc.dist_y;
+#endif
}
}
reached_last_index = (idx == 4);
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (reached_last_index && sum_rdc.rdcost != INT64_MAX &&
+ bsize == BLOCK_8X8) {
+ int use_activity_masking = 0;
+ int64_t daala_dist;
+ const int src_stride = x->plane[0].src.stride;
+ daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride - 4,
+ src_stride, x->decoded_8x8, 8, 8, 8, 1,
+ use_activity_masking, x->qindex)
+ << 4;
+ sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
#if CONFIG_SUPERTX
if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
TX_SIZE supertx_size = max_txsize_lookup[bsize];
@@ -3716,6 +3759,16 @@
subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
#endif // CONFIG_SUPERTX
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+ update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
+ subsize, DRY_RUN_NORMAL);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
+ subsize, NULL);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
#if CONFIG_SUPERTX
@@ -3728,7 +3781,24 @@
#if CONFIG_SUPERTX
sum_rate_nocoef += this_rate_nocoef;
#endif // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ sum_rdc.dist_y += this_rdc.dist_y;
+#endif
}
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+ int use_activity_masking = 0;
+ int64_t daala_dist;
+ const int src_stride = x->plane[0].src.stride;
+ daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride,
+ src_stride, x->decoded_8x8, 8, 8, 8, 1,
+ use_activity_masking, x->qindex)
+ << 4;
+ sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
}
#if CONFIG_SUPERTX
@@ -3863,6 +3933,16 @@
subsize, &pc_tree->vertical[1],
best_rdc.rdcost - sum_rdc.rdcost);
#endif // CONFIG_SUPERTX
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+ update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
+ subsize, DRY_RUN_NORMAL);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
+ subsize, NULL);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
#if CONFIG_SUPERTX
@@ -3875,7 +3955,24 @@
#if CONFIG_SUPERTX
sum_rate_nocoef += this_rate_nocoef;
#endif // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ sum_rdc.dist_y += this_rdc.dist_y;
+#endif
}
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+ int use_activity_masking = 0;
+ int64_t daala_dist;
+ const int src_stride = x->plane[0].src.stride;
+ daala_dist =
+ av1_daala_dist(x->plane[0].src.buf - 4, src_stride, x->decoded_8x8,
+ 8, 8, 8, 1, use_activity_masking, x->qindex)
+ << 4;
+ sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
}
#if CONFIG_SUPERTX
if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
@@ -4031,6 +4128,14 @@
x->cfl_store_y = 0;
#endif
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ bsize == BLOCK_4X4 && pc_tree->index == 3) {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
if (bsize == cm->sb_size) {
#if !CONFIG_PVQ && !CONFIG_LV_MAP
assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
@@ -5517,6 +5622,13 @@
#endif
}
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ daala_dist_set_sub8x8_dst(x, x->decoded_8x8, bsize, block_size_wide[bsize],
+ block_size_high[bsize], mi_row, mi_col);
+ }
+#endif
+
if (!dry_run) {
#if CONFIG_VAR_TX
TX_SIZE tx_size =
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index d22789f..5c3eee4 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -361,6 +361,9 @@
rd_stats->rdcost = 0;
rd_stats->sse = 0;
rd_stats->skip = 1;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ rd_stats->dist_y = 0;
+#endif
#if CONFIG_RD_DEBUG
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
rd_stats->txb_coeff_cost[plane] = 0;
@@ -385,6 +388,9 @@
rd_stats->rdcost = INT64_MAX;
rd_stats->sse = INT64_MAX;
rd_stats->skip = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ rd_stats->dist_y = INT64_MAX;
+#endif
#if CONFIG_RD_DEBUG
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
rd_stats->txb_coeff_cost[plane] = INT_MAX;
@@ -409,6 +415,9 @@
rd_stats_dst->dist += rd_stats_src->dist;
rd_stats_dst->sse += rd_stats_src->sse;
rd_stats_dst->skip &= rd_stats_src->skip;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ rd_stats_dst->dist_y += rd_stats_src->dist_y;
+#endif
#if CONFIG_RD_DEBUG
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 39cd27d..b23acce 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -575,10 +575,9 @@
return sum;
}
-static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int bsw,
- int bsh, int qm, int use_activity_masking,
- int qindex) {
+int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, int bsw, int bsh, int qm,
+ int use_activity_masking, int qindex) {
int i, j;
int64_t d;
DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
@@ -1577,7 +1576,7 @@
rd = AOMMIN(rd1, rd2);
#if CONFIG_DAALA_DIST
- if (plane == 0 &&
+ if (plane == 0 && plane_bsize >= BLOCK_8X8 &&
(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
this_rd_stats.dist = 0;
this_rd_stats.sse = 0;
@@ -1615,6 +1614,9 @@
int use_activity_masking = 0;
(void)tx_size;
+
+ assert(plane == 0);
+ assert(plane_bsize >= BLOCK_8X8);
#if CONFIG_PVQ
use_activity_masking = x->daala_enc.use_activity_masking;
#endif // CONFIG_PVQ
@@ -1674,10 +1676,15 @@
{
const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int offset_h = tx_size_high_unit[TX_4X4];
// The rate of the current 8x8 block is the sum of four 4x4 blocks in it.
- this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] +
- x->rate_4x4[block - max_blocks_wide] +
- x->rate_4x4[block - 1] + x->rate_4x4[block];
+ this_rd_stats.rate =
+ x->rate_4x4[block - max_blocks_wide * offset_h - step] +
+ x->rate_4x4[block - max_blocks_wide * offset_h] +
+ x->rate_4x4[block - step] + x->rate_4x4[block];
}
rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
@@ -1714,10 +1721,10 @@
av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
#if CONFIG_DAALA_DIST
- if (plane == 0 &&
+ if (plane == 0 && bsize >= BLOCK_8X8 &&
(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
- av1_foreach_8x8_transformed_block_in_plane(
- xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
+ av1_foreach_8x8_transformed_block_in_yplane(
+ xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
else
#endif // CONFIG_DAALA_DIST
av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
@@ -9498,6 +9505,9 @@
rd_cost->dist = dist_y + dist_uv;
}
rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ rd_cost->dist_y = dist_y;
+#endif
} else {
rd_cost->rate = INT_MAX;
}
@@ -10234,6 +10244,10 @@
int compmode_cost = 0;
int rate2 = 0, rate_y = 0, rate_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ int64_t distortion2_y = 0;
+ int64_t total_sse_y = INT64_MAX;
+#endif
int skippable = 0;
int this_skip2 = 0;
int64_t total_sse = INT64_MAX;
@@ -10575,6 +10589,9 @@
if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) distortion2_y = distortion_y;
+#endif
} else {
int_mv backup_ref_mv[2];
@@ -10668,6 +10685,9 @@
total_sse = rd_stats.sse;
rate_y = rd_stats_y.rate;
rate_uv = rd_stats_uv.rate;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist;
+#endif
}
// TODO(jingning): This needs some refactoring to improve code quality
@@ -10877,6 +10897,12 @@
tmp_ref_rd = tmp_alt_rd;
backup_mbmi = *mbmi;
backup_skip = x->skip;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) {
+ total_sse_y = tmp_rd_stats_y.sse;
+ distortion2_y = tmp_rd_stats_y.dist;
+ }
+#endif
#if CONFIG_VAR_TX
for (i = 0; i < MAX_MB_PLANE; ++i)
memcpy(x->blk_skip_drl[i], x->blk_skip[i],
@@ -10950,6 +10976,9 @@
this_skip2 = 1;
rate_y = 0;
rate_uv = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) distortion2_y = total_sse_y;
+#endif
}
} else {
// Add in the cost of the no skip flag.
@@ -11039,7 +11068,9 @@
best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
this_skip2 || skippable);
best_rate_uv = rate_uv;
-
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y;
+#endif
#if CONFIG_VAR_TX
for (i = 0; i < MAX_MB_PLANE; ++i)
memcpy(ctx->blk_skip[i], x->blk_skip[i],
@@ -11167,6 +11198,9 @@
rd_cost->rate +=
(rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist;
+#endif
rd_cost->rdcost =
RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
best_skip2 = skip_blk;
@@ -11686,7 +11720,9 @@
rd_cost->rate = rate2;
rd_cost->dist = distortion2;
rd_cost->rdcost = this_rd;
-
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+ if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
+#endif
if (this_rd >= best_rd_so_far) {
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index a7053b2..6563773 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -62,6 +62,12 @@
TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
OUTPUT_STATUS output_status);
+#if CONFIG_DAALA_DIST
+int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, int bsw, int bsh, int qm,
+ int use_activity_masking, int qindex);
+#endif
+
#if !CONFIG_PVQ || CONFIG_VAR_TX
int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,