Merge "Add parallel-deblocking experiment" into nextgenv2
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index b029cb4..dec5514 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -1195,9 +1195,10 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-void av1_filter_block_plane_non420(AV1_COMMON *cm,
- struct macroblockd_plane *plane,
- MODE_INFO **mib, int mi_row, int mi_col) {
+void av1_filter_block_plane_non420_ver(AV1_COMMON *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mib, int mi_row,
+ int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_y;
@@ -1381,6 +1382,22 @@
// Now do horizontal pass
dst->buf = dst0;
+}
+
+void av1_filter_block_plane_non420_hor(AV1_COMMON *cm,
+ struct macroblockd_plane *plane,
+ int mi_row) {
+ const int ss_y = plane->subsampling_y;
+ const int row_step = 1 << ss_y;
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ int r;
+
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -1416,11 +1433,12 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
dst->buf += MI_SIZE * dst->stride;
}
+ dst->buf = dst0;
}
-void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
uint8_t *const dst0 = dst->buf;
int r;
@@ -1464,10 +1482,20 @@
// Horizontal pass
dst->buf = dst0;
- mask_16x16 = lfm->above_y[TX_16X16];
- mask_8x8 = lfm->above_y[TX_8X8];
- mask_4x4 = lfm->above_y[TX_4X4];
- mask_4x4_int = lfm->int_4x4_y;
+}
+
+void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->above_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
unsigned int mask_16x16_r;
@@ -1507,11 +1535,13 @@
mask_4x4 >>= MI_SIZE;
mask_4x4_int >>= MI_SIZE;
}
+ // Restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
}
-void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
uint8_t *const dst0 = dst->buf;
int r, c;
@@ -1566,10 +1596,20 @@
// Horizontal pass
dst->buf = dst0;
- mask_16x16 = lfm->above_uv[TX_16X16];
- mask_8x8 = lfm->above_uv[TX_8X8];
- mask_4x4 = lfm->above_uv[TX_4X4];
- mask_4x4_int = lfm->above_int_4x4_uv;
+}
+
+void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
+ uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
@@ -1612,6 +1652,8 @@
mask_4x4 >>= MI_SIZE / 2;
mask_4x4_int >>= MI_SIZE / 2;
}
+ // Restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
}
void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
@@ -1634,12 +1676,14 @@
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
- for (plane = 0; plane < num_planes; ++plane)
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
- mi_col);
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+ }
}
}
-#else
+#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
enum lf_path path;
@@ -1653,7 +1697,7 @@
path = LF_PATH_444;
else
path = LF_PATH_SLOW;
-
+#if CONFIG_PARALLEL_DEBLOCKING
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
@@ -1664,23 +1708,83 @@
// TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
- av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
for (plane = 1; plane < num_planes; ++plane) {
switch (path) {
case LF_PATH_420:
- av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_444:
- av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_SLOW:
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
break;
}
}
}
}
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ int plane;
+
+ av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
+ av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+ break;
+ }
+ }
+ }
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ int plane;
+
+ av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
+ av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+
+ break;
+ }
+ }
+ }
+ }
+#endif // CONFIG_PARALLEL_DEBLOCKING
#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
}
diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h
index 975cbdf..cdc2512 100644
--- a/av1/common/loopfilter.h
+++ b/av1/common/loopfilter.h
@@ -99,17 +99,26 @@
const int mi_col, MODE_INFO **mi_8x8,
const int mode_info_stride, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss00(struct AV1Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss11(struct AV1Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_non420(struct AV1Common *cm,
- struct macroblockd_plane *plane,
- MODE_INFO **mi_8x8, int mi_row, int mi_col);
+void av1_filter_block_plane_non420_ver(struct AV1Common *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col);
+void av1_filter_block_plane_non420_hor(struct AV1Common *cm,
+ struct macroblockd_plane *plane,
+ int mi_row);
void av1_loop_filter_init(struct AV1Common *cm);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index eeaeb21..1100671 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -85,25 +85,153 @@
#endif // CONFIG_MULTITHREAD
}
-// Implement row loopfiltering for each thread.
-static INLINE void thread_loop_filter_rows(
- const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
- struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop,
- int y_only, AV1LfSync *const lf_sync) {
- const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+static INLINE enum lf_path get_loop_filter_path(  // Selects the lf_path used to filter planes 1 and up.
+ int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {  // only planes[1] is read; presumably planes[2] matches - TODO confirm
+ if (y_only)
+ return LF_PATH_444;  // luma only: planes[1] is not inspected
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ return LF_PATH_420;  // plane 1 subsampled in both x and y
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ return LF_PATH_444;  // plane 1 not subsampled in either direction
+ else
+ return LF_PATH_SLOW;  // any other subsampling combination
+}
+
+static INLINE void loop_filter_block_plane_ver(  // Runs the vertical-edge ("_ver") filter for one plane.
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,  // mi is offset by mi_col only for the non420 call below
+ LOOP_FILTER_MASK *lfm) {
+ if (plane == 0) {  // plane 0 always takes the ss00 filter; path is ignored
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
+ } else {  // remaining planes dispatch on path
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:  // this variant is given the mode-info pointer instead of lfm
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
+ }
+}
+
+static INLINE void loop_filter_block_plane_hor(  // Runs the horizontal-edge ("_hor") filter for one plane.
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ int mi_row, enum lf_path path, LOOP_FILTER_MASK *lfm) {  // unlike the _ver dispatcher, no mi / mi_col parameters
+ if (plane == 0) {  // plane 0 always takes the ss00 filter; path is ignored
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
+ } else {  // remaining planes dispatch on path
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);  // NOTE(review): callee gets neither lfm nor mode info,
+ break;  // and the visible part of it starts from zero-initialized masks - confirm this is intended
+ }
+ }
+}
+#endif
+// Row-based multi-threaded loopfilter hook
+#if CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,  // Worker hook: vertical-edge pass only.
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
#if !CONFIG_EXT_PARTITION_TYPES
- enum lf_path path;
- LOOP_FILTER_MASK lfm;
- if (y_only)
- path = LF_PATH_444;
- else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
- path = LF_PATH_420;
- else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
- path = LF_PATH_444;
- else
- path = LF_PATH_SLOW;
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {  // advance by num_workers block rows per iteration
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+ // No sync_read/sync_write calls in this pass, unlike loop_filter_hor_row_worker.
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ LOOP_FILTER_MASK lfm;
+ int plane;
+ // Set up the plane buffers and the filter mask for the block at (mi_row, mi_col).
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+ // NOTE(review): lfm is built unconditionally but is not used when CONFIG_EXT_PARTITION_TYPES is set.
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi,
+ mi_row, mi_col, path, &lfm);
+#endif
+ }
+ }
+ return 1;  // constant result; its meaning is defined by the worker interface (not shown here)
+}
+
+static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,  // Worker hook: horizontal-edge pass only.
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =  // aligned column count in mib_size units; passed to sync_write below
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+ // Each iteration advances by num_workers block rows.
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+ // Every block is bracketed by sync_read / sync_write on lf_sync.
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;  // row index in mib_size units
+ const int c = mi_col >> lf_data->cm->mib_size_log2;  // column index in mib_size units
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ // TODO(wenhao.zhang@intel.com): For better parallelization, reorder
+ // the outer loop to column-based and remove the synchronizations here.
+ sync_read(lf_sync, r, c);  // exact wait condition is defined by sync_read (not shown here)
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,  // NOTE(review): lfm is unused when CONFIG_EXT_PARTITION_TYPES is set
+ lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi_row);
+#else
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row,
+ path, &lfm);
+#endif
+ sync_write(lf_sync, r, c, sb_cols);  // publish progress for block (r, c)
+ }
+ }
+ return 1;  // constant result; its meaning is defined by the worker interface (not shown here)
+}
+#else // CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
#endif // !CONFIG_EXT_PARTITION_TYPES
#if CONFIG_EXT_PARTITION
@@ -113,56 +241,48 @@
exit(EXIT_FAILURE);
#endif // CONFIG_EXT_PARTITION
- for (mi_row = start; mi_row < stop;
- mi_row += lf_sync->num_workers * cm->mib_size) {
- MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
- const int r = mi_row >> cm->mib_size_log2;
- const int c = mi_col >> cm->mib_size_log2;
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+ LOOP_FILTER_MASK lfm;
+#endif
int plane;
sync_read(lf_sync, r, c);
- av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
#if CONFIG_EXT_PARTITION_TYPES
- for (plane = 0; plane < num_planes; ++plane)
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
- mi_col);
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi_row);
+ }
#else
- // TODO(JBB): Make setup_mask work for non 420.
- av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
- av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
- for (plane = 1; plane < num_planes; ++plane) {
- switch (path) {
- case LF_PATH_420:
- av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
- break;
- case LF_PATH_444:
- av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
- break;
- case LF_PATH_SLOW:
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
- break;
- }
+ for (plane = 0; plane < num_planes; ++plane) {
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi,
+ mi_row, mi_col, path, &lfm);
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row,
+ path, &lfm);
}
#endif // CONFIG_EXT_PARTITION_TYPES
sync_write(lf_sync, r, c, sb_cols);
}
}
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
- thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
- lf_data->start, lf_data->stop, lf_data->y_only,
- lf_sync);
return 1;
}
+#endif // CONFIG_PARALLEL_DEBLOCKING
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -191,17 +311,79 @@
av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
}
+// Set up loopfilter thread data.
+// The decoder is capping num_workers because it has been observed that using
+// more threads on the loopfilter than there are cores will hurt performance
+// on Android. This is because the system will only schedule the tile decode
+// workers on cores equal to the number of tile columns. Then if the decoder
+// tries to use more threads for the loopfilter, it will hurt performance
+// because of contention. If the multithreading code changes in the future
+// then the number of workers used by the loopfilter should be revisited.
+
+#if CONFIG_PARALLEL_DEBLOCKING
// Initialize cur_sb_col to -1 for all SB rows.
memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
- // Set up loopfilter thread data.
- // The decoder is capping num_workers because it has been observed that using
- // more threads on the loopfilter than there are cores will hurt performance
- // on Android. This is because the system will only schedule the tile decode
- // workers on cores equal to the number of tile columns. Then if the decoder
- // tries to use more threads for the loopfilter, it will hurt performance
- // because of contention. If the multithreading code changes in the future
- // then the number of workers used by the loopfilter should be revisited.
+ // Filter all the vertical edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+ // Filter all the horizontal edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
for (i = 0; i < num_workers; ++i) {
AVxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
@@ -228,6 +410,7 @@
for (i = 0; i < num_workers; ++i) {
winterface->sync(&workers[i]);
}
+#endif // CONFIG_PARALLEL_DEBLOCKING
}
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
diff --git a/configure b/configure
index 5bfce04..fa458f7 100755
--- a/configure
+++ b/configure
@@ -289,6 +289,7 @@
delta_q
adapt_scan
filter_7bit
+ parallel_deblocking
"
CONFIG_LIST="
dependency_tracking