AV1 RT: Dual filter for nonRD in RT mode
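Since nonRD RT mode does not use 4x4 transforms for luma, two neighbouring
4-sample edge segments can share one set of loop-filter parameters, so the
dual filter kernels are used and every second LPF setup is skipped.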
This gives a 2-3% overall speed-up on 360p content at speed 9.
Change-Id: If9a0f8a576cd9cbf912fffca6307e4cf180ecc1e
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index fbee8da..d316e45 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -462,6 +462,82 @@
}
}
+// Real-time (nonRD) variant of av1_filter_block_plane_vert(): deblocks the
+// vertical edges of one plane inside a region of at most
+// MAX_MIB_SIZE x MAX_MIB_SIZE MI units (fewer when subsampled or when the
+// region crosses the frame boundary).
+//
+// Rows are visited two MI units at a time (y += 2). The deblocking
+// parameters are derived once, at the upper row of each pair, and the same
+// mblim/lim/hev_thr set is passed twice to the aom_lpf_vertical_*_dual()
+// kernels. This is only valid when both rows of a pair would have produced
+// the same parameters; per the introducing change that holds for luma in
+// nonRD real-time mode, where 4x4 transforms are not used. Callers are
+// expected to select this path only under that condition.
+//
+//   cm         common codec state; frame size in MI units is read from it
+//   xd         macroblock descriptor, forwarded to set_lpf_parameters()
+//   plane      plane index, forwarded to set_lpf_parameters()
+//   plane_ptr  supplies subsampling factors and the destination buffer/stride
+//   mi_row     row of the region origin, in MI units
+//   mi_col     column of the region origin, in MI units
+void av1_filter_block_plane_vert_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  // Clip the processed area to the part of the region that lies inside the
+  // frame.
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  // One parameter setup serves a pair of MI rows.
+  for (int y = 0; y < y_range; y += 2) {
+    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+    for (int x = 0; x < x_range;) {
+      // Each iteration handles the vertical edge at column x, then jumps
+      // ahead by the width of the transform block reported for that edge.
+      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      tx_size =
+          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        // No valid transform size for this position: do not filter, and
+        // advance by the TX_4X4 width so the scan still makes progress.
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      // The same threshold set is supplied for both halves of the dual
+      // kernel.
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_vertical_4_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          assert(plane != 0);
+          aom_lpf_vertical_6_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_vertical_8_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_vertical_14_dual(p, dst_stride, params.mblim, params.lim,
+                                   params.hev_thr, params.mblim, params.lim,
+                                   params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+      // advance the destination pointer
+      advance_units = tx_size_wide_unit[tx_size];
+      x += advance_units;
+      p += advance_units * MI_SIZE;
+    }
+  }
+}
+
void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
@@ -584,6 +660,83 @@
}
}
+// Real-time (nonRD) variant of av1_filter_block_plane_horz(): deblocks the
+// horizontal edges of one plane inside a region of at most
+// MAX_MIB_SIZE x MAX_MIB_SIZE MI units (fewer when subsampled or when the
+// region crosses the frame boundary).
+//
+// Columns are visited two MI units at a time (x += 2). The deblocking
+// parameters are derived once, at the left column of each pair, and the same
+// mblim/lim/hev_thr set is passed twice to the aom_lpf_horizontal_*_dual()
+// kernels. This is only valid when both columns of a pair would have
+// produced the same parameters; per the introducing change that holds for
+// luma in nonRD real-time mode, where 4x4 transforms are not used. Callers
+// are expected to select this path only under that condition.
+//
+//   cm         common codec state; frame size in MI units and mi_stride are
+//              read from it
+//   xd         macroblock descriptor, forwarded to set_lpf_parameters()
+//   plane      plane index, forwarded to set_lpf_parameters()
+//   plane_ptr  supplies subsampling factors and the destination buffer/stride
+//   mi_row     row of the region origin, in MI units
+//   mi_col     column of the region origin, in MI units
+void av1_filter_block_plane_horz_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  // Clip the processed area to the part of the region that lies inside the
+  // frame.
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  // One parameter setup serves a pair of MI columns.
+  for (int x = 0; x < x_range; x += 2) {
+    uint8_t *p = dst_ptr + x * MI_SIZE;
+    for (int y = 0; y < y_range;) {
+      // Each iteration handles the horizontal edge at row y, then jumps
+      // ahead by the height of the transform block reported for that edge.
+      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      tx_size = set_lpf_parameters(
+          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+          curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        // No valid transform size for this position: do not filter, and
+        // advance by the TX_4X4 height so the scan still makes progress.
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      // The same threshold set is supplied for both halves of the dual
+      // kernel.
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_horizontal_4_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 6-tap filtering
+        case 6:
+          assert(plane != 0);
+          aom_lpf_horizontal_6_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_horizontal_8_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_horizontal_14_dual(p, dst_stride, params.mblim, params.lim,
+                                     params.hev_thr, params.mblim, params.lim,
+                                     params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+      // advance the destination pointer
+      advance_units = tx_size_high_unit[tx_size];
+      y += advance_units;
+      p += advance_units * dst_stride * MI_SIZE;
+    }
+  }
+}
+
void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd,
const int plane,
@@ -673,7 +826,7 @@
#if CONFIG_LPF_MASK
int is_decoding,
#endif
- int plane_start, int plane_end) {
+ int plane_start, int plane_end, int is_realtime) {
struct macroblockd_plane *pd = xd->plane;
const int col_start = 0;
const int col_end = cm->mi_params.mi_cols;
@@ -728,7 +881,6 @@
continue;
else if (plane == 2 && !(cm->lf.filter_level_v))
continue;
-
if (cm->lf.combine_vert_horz_lf) {
// filter all vertical and horizontal edges in every 128x128 super block
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
@@ -736,22 +888,57 @@
// filter vertical edges
av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
mi_col);
+#else
+ if (is_realtime && !plane) {
+ av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+
+ } else {
+ av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ }
+#endif
// filter horizontal edges
if (mi_col - MAX_MIB_SIZE >= 0) {
av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col - MAX_MIB_SIZE, plane,
plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
mi_col - MAX_MIB_SIZE);
+#else
+ if (is_realtime && !plane) {
+ av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+ } else {
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+ }
+#endif
}
}
// filter horizontal edges
av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row,
mi_col - MAX_MIB_SIZE, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
mi_col - MAX_MIB_SIZE);
+#else
+ if (is_realtime && !plane) {
+ av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+
+ } else {
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+ }
+#endif
}
} else {
// filter all vertical edges in every 128x128 super block
@@ -759,8 +946,19 @@
for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
mi_col);
+#else
+ if (is_realtime && !plane) {
+ av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ } else {
+ av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ }
+#endif
}
}
@@ -769,8 +967,20 @@
for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
mi_col);
+#else
+ if (is_realtime && !plane) {
+ av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+
+ } else {
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ }
+#endif
}
}
}
@@ -782,7 +992,8 @@
#if CONFIG_LPF_MASK
int is_decoding,
#endif
- int plane_start, int plane_end, int partial_frame) {
+ int plane_start, int plane_end, int partial_frame,
+ int is_realtime) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
@@ -798,5 +1009,5 @@
#if CONFIG_LPF_MASK
is_decoding,
#endif
- plane_start, plane_end);
+ plane_start, plane_end, is_realtime);
}
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index ca16bbe..ed4453b 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -151,7 +151,7 @@
#else
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
struct macroblockd *xd, int plane_start,
- int plane_end, int partial_frame);
+ int plane_end, int partial_frame, int is_realtime);
#endif
void av1_filter_block_plane_vert(const struct AV1Common *const cm,
@@ -164,6 +164,20 @@
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col);
+void av1_filter_block_plane_vert_rt(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd,
+ const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row,
+ const uint32_t mi_col);
+
+void av1_filter_block_plane_horz_rt(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd,
+ const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row,
+ const uint32_t mi_col);
+
uint8_t av1_get_filter_level(const struct AV1Common *cm,
const loop_filter_info_n *lfi_n, const int dir_idx,
int plane, const MB_MODE_INFO *mbmi);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index ef23450..0c45749 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -266,7 +266,7 @@
#if CONFIG_LPF_MASK
int is_decoding,
#endif
- int plane_start, int plane_end) {
+ int plane_start, int plane_end, int is_realtime) {
int mi_row, plane, dir;
AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
lf_sync->jobs_enqueued = 0;
@@ -293,6 +293,7 @@
lf_job_queue->mi_row = mi_row;
lf_job_queue->plane = plane;
lf_job_queue->dir = dir;
+ lf_job_queue->is_realtime = is_realtime;
lf_job_queue++;
lf_sync->jobs_enqueued++;
}
@@ -327,7 +328,7 @@
const int sb_cols =
ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
MAX_MIB_SIZE_LOG2;
- int mi_row, mi_col, plane, dir;
+ int mi_row, mi_col, plane, dir, is_realtime;
int r, c;
while (1) {
@@ -338,6 +339,7 @@
plane = cur_job_info->plane;
dir = cur_job_info->dir;
r = mi_row >> MAX_MIB_SIZE_LOG2;
+ is_realtime = cur_job_info->is_realtime && !plane;
if (dir == 0) {
for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
@@ -346,9 +348,20 @@
av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + 1);
-
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
mi_col);
+#else
+ if (is_realtime) {
+ av1_filter_block_plane_vert_rt(cm, xd, plane, &planes[plane],
+ mi_row, mi_col);
+
+ } else {
+ av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+#endif
sync_write(lf_sync, r, c, sb_cols, plane);
}
} else if (dir == 1) {
@@ -366,8 +379,19 @@
av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+ (void)is_realtime;
av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
mi_col);
+#else
+ if (is_realtime) {
+ av1_filter_block_plane_horz_rt(cm, xd, plane, &planes[plane],
+ mi_row, mi_col);
+ } else {
+ av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+#endif
}
}
} else {
@@ -460,7 +484,7 @@
int is_decoding,
#endif
AVxWorker *workers, int nworkers,
- AV1LfSync *lf_sync) {
+ AV1LfSync *lf_sync, int is_realtime) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
#if CONFIG_LPF_MASK
int sb_rows;
@@ -496,7 +520,7 @@
#if CONFIG_LPF_MASK
is_decoding,
#endif
- plane_start, plane_end);
+ plane_start, plane_end, is_realtime);
// Set up loopfilter thread data.
for (i = num_workers - 1; i >= 0; --i) {
@@ -539,7 +563,7 @@
int is_decoding,
#endif
AVxWorker *workers, int num_workers,
- AV1LfSync *lf_sync) {
+ AV1LfSync *lf_sync, int is_realtime) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
@@ -581,7 +605,7 @@
}
#else
loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
- plane_end, workers, num_workers, lf_sync);
+ plane_end, workers, num_workers, lf_sync, is_realtime);
#endif
}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index b04f4fd..bcb4b87 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -28,6 +28,7 @@
int mi_row;
int plane;
int dir;
+ int is_realtime;
} AV1LfMTInfo;
// Loopfilter row synchronization
@@ -157,7 +158,7 @@
int is_decoding,
#endif
AVxWorker *workers, int num_workers,
- AV1LfSync *lf_sync);
+ AV1LfSync *lf_sync, int is_realtime);
#if !CONFIG_REALTIME_ONLY
void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index c4000bd..b5818f5 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5293,13 +5293,13 @@
#if CONFIG_LPF_MASK
1,
#endif
- pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
+ pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync, 0);
} else {
av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd,
#if CONFIG_LPF_MASK
1,
#endif
- 0, num_planes, 0);
+ 0, num_planes, 0, 0);
}
}
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 10bb51d..30bca96 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2193,6 +2193,7 @@
const int use_restoration = cm->seq_params->enable_restoration &&
!cm->features.all_lossless &&
!cm->tiles.large_scale;
+ const int is_realtime = cpi->sf.rt_sf.use_nonrd_pick_mode;
struct loopfilter *lf = &cm->lf;
@@ -2215,13 +2216,13 @@
0,
#endif
mt_info->workers, num_workers,
- &mt_info->lf_row_sync);
+ &mt_info->lf_row_sync, is_realtime);
else
av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
#if CONFIG_LPF_MASK
0,
#endif
- 0, num_planes, 0);
+ 0, num_planes, 0, is_realtime);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, loop_filter_time);
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index f2168c4..9858e20 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -78,13 +78,13 @@
0,
#endif
mt_info->workers, num_workers,
- &mt_info->lf_row_sync);
+ &mt_info->lf_row_sync, 0);
else
av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
#if CONFIG_LPF_MASK
0,
#endif
- plane, plane + 1, partial_frame);
+ plane, plane + 1, partial_frame, 0);
filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
cm->seq_params->use_highbitdepth);