/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
| 11 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 12 | #include "./aom_config.h" |
| 13 | #include "aom_dsp/aom_dsp_common.h" |
| 14 | #include "aom_mem/aom_mem.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 15 | #include "av1/common/entropymode.h" |
| 16 | #include "av1/common/thread_common.h" |
| 17 | #include "av1/common/reconinter.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 18 | |
#if CONFIG_MULTITHREAD
// Acquires |mutex|. Makes a bounded number of non-blocking
// pthread_mutex_trylock() attempts first and only falls back to a blocking
// pthread_mutex_lock() when none of them succeeds.
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
  const int max_attempts = 4000;
  int attempt;

  for (attempt = 0; attempt < max_attempts; ++attempt) {
    // A zero return means the lock is now held by this thread.
    if (pthread_mutex_trylock(mutex) == 0) return;
  }

  // Every non-blocking attempt failed; block until the lock is available.
  pthread_mutex_lock(mutex);
}
#endif  // CONFIG_MULTITHREAD
| 35 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 36 | static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) { |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 37 | #if CONFIG_MULTITHREAD |
| 38 | const int nsync = lf_sync->sync_range; |
| 39 | |
| 40 | if (r && !(c & (nsync - 1))) { |
| 41 | pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; |
| 42 | mutex_lock(mutex); |
| 43 | |
| 44 | while (c > lf_sync->cur_sb_col[r - 1] - nsync) { |
| 45 | pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); |
| 46 | } |
| 47 | pthread_mutex_unlock(mutex); |
| 48 | } |
| 49 | #else |
| 50 | (void)lf_sync; |
| 51 | (void)r; |
| 52 | (void)c; |
| 53 | #endif // CONFIG_MULTITHREAD |
| 54 | } |
| 55 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 56 | static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 57 | const int sb_cols) { |
| 58 | #if CONFIG_MULTITHREAD |
| 59 | const int nsync = lf_sync->sync_range; |
| 60 | int cur; |
| 61 | // Only signal when there are enough filtered SB for next row to run. |
| 62 | int sig = 1; |
| 63 | |
| 64 | if (c < sb_cols - 1) { |
| 65 | cur = c; |
| 66 | if (c % nsync) sig = 0; |
| 67 | } else { |
| 68 | cur = sb_cols + nsync; |
| 69 | } |
| 70 | |
| 71 | if (sig) { |
| 72 | mutex_lock(&lf_sync->mutex_[r]); |
| 73 | |
| 74 | lf_sync->cur_sb_col[r] = cur; |
| 75 | |
| 76 | pthread_cond_signal(&lf_sync->cond_[r]); |
| 77 | pthread_mutex_unlock(&lf_sync->mutex_[r]); |
| 78 | } |
| 79 | #else |
| 80 | (void)lf_sync; |
| 81 | (void)r; |
| 82 | (void)c; |
| 83 | (void)sb_cols; |
| 84 | #endif // CONFIG_MULTITHREAD |
| 85 | } |
| 86 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 87 | #if !CONFIG_EXT_PARTITION_TYPES |
| 88 | static INLINE enum lf_path get_loop_filter_path( |
| 89 | int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) { |
| 90 | if (y_only) |
| 91 | return LF_PATH_444; |
| 92 | else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) |
| 93 | return LF_PATH_420; |
| 94 | else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) |
| 95 | return LF_PATH_444; |
| 96 | else |
| 97 | return LF_PATH_SLOW; |
| 98 | } |
| 99 | |
| 100 | static INLINE void loop_filter_block_plane_ver( |
| 101 | AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, |
| 102 | MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, |
| 103 | LOOP_FILTER_MASK *lfm) { |
| 104 | if (plane == 0) { |
| 105 | av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm); |
| 106 | } else { |
| 107 | switch (path) { |
| 108 | case LF_PATH_420: |
| 109 | av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm); |
| 110 | break; |
| 111 | case LF_PATH_444: |
| 112 | av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm); |
| 113 | break; |
| 114 | case LF_PATH_SLOW: |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 115 | av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row, |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 116 | mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 117 | break; |
| 118 | } |
| 119 | } |
| 120 | } |
| 121 | |
| 122 | static INLINE void loop_filter_block_plane_hor( |
| 123 | AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 124 | MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, |
| 125 | LOOP_FILTER_MASK *lfm) { |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 126 | if (plane == 0) { |
| 127 | av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm); |
| 128 | } else { |
| 129 | switch (path) { |
| 130 | case LF_PATH_420: |
| 131 | av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm); |
| 132 | break; |
| 133 | case LF_PATH_444: |
| 134 | av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm); |
| 135 | break; |
| 136 | case LF_PATH_SLOW: |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 137 | av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row, |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 138 | mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 139 | break; |
| 140 | } |
| 141 | } |
| 142 | } |
| 143 | #endif |
| 144 | // Row-based multi-threaded loopfilter hook |
| 145 | #if CONFIG_PARALLEL_DEBLOCKING |
| 146 | static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync, |
| 147 | LFWorkerData *const lf_data) { |
| 148 | const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 149 | int mi_row, mi_col; |
| 150 | #if !CONFIG_EXT_PARTITION_TYPES |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 151 | enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); |
| 152 | #endif |
| 153 | for (mi_row = lf_data->start; mi_row < lf_data->stop; |
| 154 | mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { |
| 155 | MODE_INFO **const mi = |
| 156 | lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; |
| 157 | |
| 158 | for (mi_col = 0; mi_col < lf_data->cm->mi_cols; |
| 159 | mi_col += lf_data->cm->mib_size) { |
| 160 | LOOP_FILTER_MASK lfm; |
| 161 | int plane; |
| 162 | |
Jingning Han | 91d9a79 | 2017-04-18 12:01:52 -0700 | [diff] [blame] | 163 | av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, |
| 164 | lf_data->frame_buffer, mi_row, mi_col); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 165 | av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, |
| 166 | lf_data->cm->mi_stride, &lfm); |
| 167 | |
| 168 | #if CONFIG_EXT_PARTITION_TYPES |
| 169 | for (plane = 0; plane < num_planes; ++plane) |
| 170 | av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 171 | mi + mi_col, mi_row, mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 172 | #else |
| 173 | |
| 174 | for (plane = 0; plane < num_planes; ++plane) |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 175 | loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, |
| 176 | mi + mi_col, mi_row, mi_col, path, &lfm); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 177 | #endif |
| 178 | } |
| 179 | } |
| 180 | return 1; |
| 181 | } |
| 182 | |
| 183 | static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync, |
| 184 | LFWorkerData *const lf_data) { |
| 185 | const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; |
| 186 | const int sb_cols = |
| 187 | mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; |
| 188 | int mi_row, mi_col; |
| 189 | #if !CONFIG_EXT_PARTITION_TYPES |
| 190 | enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); |
| 191 | #endif |
| 192 | |
| 193 | for (mi_row = lf_data->start; mi_row < lf_data->stop; |
| 194 | mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { |
| 195 | MODE_INFO **const mi = |
| 196 | lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; |
| 197 | |
| 198 | for (mi_col = 0; mi_col < lf_data->cm->mi_cols; |
| 199 | mi_col += lf_data->cm->mib_size) { |
| 200 | const int r = mi_row >> lf_data->cm->mib_size_log2; |
| 201 | const int c = mi_col >> lf_data->cm->mib_size_log2; |
| 202 | LOOP_FILTER_MASK lfm; |
| 203 | int plane; |
| 204 | |
| 205 | // TODO(wenhao.zhang@intel.com): For better parallelization, reorder |
| 206 | // the outer loop to column-based and remove the synchronizations here. |
| 207 | sync_read(lf_sync, r, c); |
| 208 | |
Jingning Han | 91d9a79 | 2017-04-18 12:01:52 -0700 | [diff] [blame] | 209 | av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, |
| 210 | lf_data->frame_buffer, mi_row, mi_col); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 211 | av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, |
| 212 | lf_data->cm->mi_stride, &lfm); |
| 213 | #if CONFIG_EXT_PARTITION_TYPES |
| 214 | for (plane = 0; plane < num_planes; ++plane) |
| 215 | av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 216 | mi + mi_col, mi_row, mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 217 | #else |
| 218 | for (plane = 0; plane < num_planes; ++plane) |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 219 | loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, |
| 220 | mi + mi_col, mi_row, mi_col, path, &lfm); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 221 | #endif |
| 222 | sync_write(lf_sync, r, c, sb_cols); |
| 223 | } |
| 224 | } |
| 225 | return 1; |
| 226 | } |
| 227 | #else // CONFIG_PARALLEL_DEBLOCKING |
| 228 | static int loop_filter_row_worker(AV1LfSync *const lf_sync, |
| 229 | LFWorkerData *const lf_data) { |
| 230 | const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; |
| 231 | const int sb_cols = |
| 232 | mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; |
| 233 | int mi_row, mi_col; |
| 234 | #if !CONFIG_EXT_PARTITION_TYPES |
| 235 | enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 236 | #endif // !CONFIG_EXT_PARTITION_TYPES |
| 237 | |
| 238 | #if CONFIG_EXT_PARTITION |
| 239 | printf( |
| 240 | "STOPPING: This code has not been modified to work with the " |
| 241 | "extended coding unit size experiment"); |
| 242 | exit(EXIT_FAILURE); |
| 243 | #endif // CONFIG_EXT_PARTITION |
| 244 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 245 | for (mi_row = lf_data->start; mi_row < lf_data->stop; |
| 246 | mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { |
| 247 | MODE_INFO **const mi = |
| 248 | lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 249 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 250 | for (mi_col = 0; mi_col < lf_data->cm->mi_cols; |
| 251 | mi_col += lf_data->cm->mib_size) { |
| 252 | const int r = mi_row >> lf_data->cm->mib_size_log2; |
| 253 | const int c = mi_col >> lf_data->cm->mib_size_log2; |
| 254 | #if !CONFIG_EXT_PARTITION_TYPES |
| 255 | LOOP_FILTER_MASK lfm; |
| 256 | #endif |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 257 | int plane; |
| 258 | |
| 259 | sync_read(lf_sync, r, c); |
| 260 | |
Jingning Han | 91d9a79 | 2017-04-18 12:01:52 -0700 | [diff] [blame] | 261 | av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size, |
| 262 | lf_data->frame_buffer, mi_row, mi_col); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 263 | #if CONFIG_EXT_PARTITION_TYPES |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 264 | for (plane = 0; plane < num_planes; ++plane) { |
| 265 | av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 266 | mi + mi_col, mi_row, mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 267 | av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], |
Jingning Han | 6e4955d | 2017-05-30 22:54:48 -0700 | [diff] [blame] | 268 | mi + mi_col, mi_row, mi_col, plane); |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 269 | } |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 270 | #else |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 271 | av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, |
| 272 | lf_data->cm->mi_stride, &lfm); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 273 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 274 | for (plane = 0; plane < num_planes; ++plane) { |
Ryan Lei | 6f8c1a7 | 2016-10-26 10:52:12 -0700 | [diff] [blame] | 275 | loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, |
| 276 | mi + mi_col, mi_row, mi_col, path, &lfm); |
| 277 | loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, |
| 278 | mi + mi_col, mi_row, mi_col, path, &lfm); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 279 | } |
| 280 | #endif // CONFIG_EXT_PARTITION_TYPES |
| 281 | sync_write(lf_sync, r, c, sb_cols); |
| 282 | } |
| 283 | } |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 284 | return 1; |
| 285 | } |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 286 | #endif // CONFIG_PARALLEL_DEBLOCKING |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 287 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 288 | static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, |
Yaowu Xu | 989dd5b | 2017-10-11 21:59:46 -0700 | [diff] [blame] | 289 | struct macroblockd_plane *planes, int start, |
| 290 | int stop, int y_only, AVxWorker *workers, |
| 291 | int nworkers, AV1LfSync *lf_sync) { |
Debargha Mukherjee | e36a08c | 2017-10-08 21:17:31 -0700 | [diff] [blame] | 292 | #if CONFIG_EXT_PARTITION |
| 293 | printf( |
| 294 | "STOPPING: This code has not been modified to work with the " |
| 295 | "extended coding unit size experiment"); |
| 296 | exit(EXIT_FAILURE); |
| 297 | #endif // CONFIG_EXT_PARTITION |
| 298 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 299 | const AVxWorkerInterface *const winterface = aom_get_worker_interface(); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 300 | // Number of superblock rows and cols |
| 301 | const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2; |
| 302 | // Decoder may allocate more threads than number of tiles based on user's |
| 303 | // input. |
| 304 | const int tile_cols = cm->tile_cols; |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 305 | const int num_workers = AOMMIN(nworkers, tile_cols); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 306 | int i; |
| 307 | |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 308 | if (!lf_sync->sync_range || sb_rows != lf_sync->rows || |
| 309 | num_workers > lf_sync->num_workers) { |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 310 | av1_loop_filter_dealloc(lf_sync); |
| 311 | av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 312 | } |
| 313 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 314 | // Set up loopfilter thread data. |
| 315 | // The decoder is capping num_workers because it has been observed that using |
| 316 | // more threads on the loopfilter than there are cores will hurt performance |
| 317 | // on Android. This is because the system will only schedule the tile decode |
| 318 | // workers on cores equal to the number of tile columns. Then if the decoder |
| 319 | // tries to use more threads for the loopfilter, it will hurt performance |
| 320 | // because of contention. If the multithreading code changes in the future |
| 321 | // then the number of workers used by the loopfilter should be revisited. |
| 322 | |
| 323 | #if CONFIG_PARALLEL_DEBLOCKING |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 324 | // Initialize cur_sb_col to -1 for all SB rows. |
| 325 | memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); |
| 326 | |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 327 | // Filter all the vertical edges in the whole frame |
| 328 | for (i = 0; i < num_workers; ++i) { |
| 329 | AVxWorker *const worker = &workers[i]; |
| 330 | LFWorkerData *const lf_data = &lf_sync->lfdata[i]; |
| 331 | |
| 332 | worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker; |
| 333 | worker->data1 = lf_sync; |
| 334 | worker->data2 = lf_data; |
| 335 | |
| 336 | // Loopfilter data |
| 337 | av1_loop_filter_data_reset(lf_data, frame, cm, planes); |
| 338 | lf_data->start = start + i * cm->mib_size; |
| 339 | lf_data->stop = stop; |
| 340 | lf_data->y_only = y_only; |
| 341 | |
| 342 | // Start loopfiltering |
| 343 | if (i == num_workers - 1) { |
| 344 | winterface->execute(worker); |
| 345 | } else { |
| 346 | winterface->launch(worker); |
| 347 | } |
| 348 | } |
| 349 | |
| 350 | // Wait till all rows are finished |
| 351 | for (i = 0; i < num_workers; ++i) { |
| 352 | winterface->sync(&workers[i]); |
| 353 | } |
| 354 | |
| 355 | memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); |
| 356 | // Filter all the horizontal edges in the whole frame |
| 357 | for (i = 0; i < num_workers; ++i) { |
| 358 | AVxWorker *const worker = &workers[i]; |
| 359 | LFWorkerData *const lf_data = &lf_sync->lfdata[i]; |
| 360 | |
| 361 | worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker; |
| 362 | worker->data1 = lf_sync; |
| 363 | worker->data2 = lf_data; |
| 364 | |
| 365 | // Loopfilter data |
| 366 | av1_loop_filter_data_reset(lf_data, frame, cm, planes); |
| 367 | lf_data->start = start + i * cm->mib_size; |
| 368 | lf_data->stop = stop; |
| 369 | lf_data->y_only = y_only; |
| 370 | |
| 371 | // Start loopfiltering |
| 372 | if (i == num_workers - 1) { |
| 373 | winterface->execute(worker); |
| 374 | } else { |
| 375 | winterface->launch(worker); |
| 376 | } |
| 377 | } |
| 378 | |
| 379 | // Wait till all rows are finished |
| 380 | for (i = 0; i < num_workers; ++i) { |
| 381 | winterface->sync(&workers[i]); |
| 382 | } |
| 383 | #else // CONFIG_PARALLEL_DEBLOCKING |
| 384 | // Initialize cur_sb_col to -1 for all SB rows. |
| 385 | memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); |
| 386 | |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 387 | for (i = 0; i < num_workers; ++i) { |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 388 | AVxWorker *const worker = &workers[i]; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 389 | LFWorkerData *const lf_data = &lf_sync->lfdata[i]; |
| 390 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 391 | worker->hook = (AVxWorkerHook)loop_filter_row_worker; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 392 | worker->data1 = lf_sync; |
| 393 | worker->data2 = lf_data; |
| 394 | |
| 395 | // Loopfilter data |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 396 | av1_loop_filter_data_reset(lf_data, frame, cm, planes); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 397 | lf_data->start = start + i * cm->mib_size; |
| 398 | lf_data->stop = stop; |
| 399 | lf_data->y_only = y_only; |
| 400 | |
| 401 | // Start loopfiltering |
| 402 | if (i == num_workers - 1) { |
| 403 | winterface->execute(worker); |
| 404 | } else { |
| 405 | winterface->launch(worker); |
| 406 | } |
| 407 | } |
| 408 | |
| 409 | // Wait till all rows are finished |
| 410 | for (i = 0; i < num_workers; ++i) { |
| 411 | winterface->sync(&workers[i]); |
| 412 | } |
Ryan Lei | 1514948 | 2016-10-25 18:48:43 -0700 | [diff] [blame] | 413 | #endif // CONFIG_PARALLEL_DEBLOCKING |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 414 | } |
| 415 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 416 | void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, |
Yaowu Xu | 989dd5b | 2017-10-11 21:59:46 -0700 | [diff] [blame] | 417 | struct macroblockd_plane *planes, |
Cheng Chen | 179479f | 2017-08-04 10:56:39 -0700 | [diff] [blame] | 418 | int frame_filter_level, |
Cheng Chen | 13fc819 | 2017-08-19 11:49:28 -0700 | [diff] [blame] | 419 | #if CONFIG_LOOPFILTER_LEVEL |
Cheng Chen | 179479f | 2017-08-04 10:56:39 -0700 | [diff] [blame] | 420 | int frame_filter_level_r, |
| 421 | #endif |
| 422 | int y_only, int partial_frame, AVxWorker *workers, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 423 | int num_workers, AV1LfSync *lf_sync) { |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 424 | int start_mi_row, end_mi_row, mi_rows_to_filter; |
| 425 | |
| 426 | if (!frame_filter_level) return; |
| 427 | |
| 428 | start_mi_row = 0; |
| 429 | mi_rows_to_filter = cm->mi_rows; |
| 430 | if (partial_frame && cm->mi_rows > 8) { |
| 431 | start_mi_row = cm->mi_rows >> 1; |
| 432 | start_mi_row &= 0xfffffff8; |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 433 | mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 434 | } |
| 435 | end_mi_row = start_mi_row + mi_rows_to_filter; |
Cheng Chen | 13fc819 | 2017-08-19 11:49:28 -0700 | [diff] [blame] | 436 | #if CONFIG_LOOPFILTER_LEVEL |
Cheng Chen | d8184da | 2017-09-26 18:15:22 -0700 | [diff] [blame] | 437 | av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level_r, |
| 438 | y_only); |
Cheng Chen | 179479f | 2017-08-04 10:56:39 -0700 | [diff] [blame] | 439 | #else |
| 440 | av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level); |
Cheng Chen | 13fc819 | 2017-08-19 11:49:28 -0700 | [diff] [blame] | 441 | #endif // CONFIG_LOOPFILTER_LEVEL |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 442 | loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only, |
| 443 | workers, num_workers, lf_sync); |
| 444 | } |
| 445 | |
// Maps the frame width to the row-synchronization granularity (nsync):
// width < 640 -> 1, width <= 1280 -> 2, width <= 4096 -> 4, larger -> 8.
static INLINE int get_sync_range(int width) {
  // nsync numbers are picked by testing. For example, for 4k
  // video, using 4 gives best performance.
  if (width < 640) return 1;
  if (width <= 1280) return 2;
  return (width <= 4096) ? 4 : 8;
}
| 459 | |
| 460 | // Allocate memory for lf row synchronization |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 461 | void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, |
| 462 | int width, int num_workers) { |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 463 | lf_sync->rows = rows; |
| 464 | #if CONFIG_MULTITHREAD |
| 465 | { |
| 466 | int i; |
| 467 | |
| 468 | CHECK_MEM_ERROR(cm, lf_sync->mutex_, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 469 | aom_malloc(sizeof(*lf_sync->mutex_) * rows)); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 470 | if (lf_sync->mutex_) { |
| 471 | for (i = 0; i < rows; ++i) { |
| 472 | pthread_mutex_init(&lf_sync->mutex_[i], NULL); |
| 473 | } |
| 474 | } |
| 475 | |
| 476 | CHECK_MEM_ERROR(cm, lf_sync->cond_, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 477 | aom_malloc(sizeof(*lf_sync->cond_) * rows)); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 478 | if (lf_sync->cond_) { |
| 479 | for (i = 0; i < rows; ++i) { |
| 480 | pthread_cond_init(&lf_sync->cond_[i], NULL); |
| 481 | } |
| 482 | } |
| 483 | } |
| 484 | #endif // CONFIG_MULTITHREAD |
| 485 | |
| 486 | CHECK_MEM_ERROR(cm, lf_sync->lfdata, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 487 | aom_malloc(num_workers * sizeof(*lf_sync->lfdata))); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 488 | lf_sync->num_workers = num_workers; |
| 489 | |
| 490 | CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 491 | aom_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 492 | |
| 493 | // Set up nsync. |
| 494 | lf_sync->sync_range = get_sync_range(width); |
| 495 | } |
| 496 | |
| 497 | // Deallocate lf synchronization related mutex and data |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 498 | void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 499 | if (lf_sync != NULL) { |
| 500 | #if CONFIG_MULTITHREAD |
| 501 | int i; |
| 502 | |
| 503 | if (lf_sync->mutex_ != NULL) { |
| 504 | for (i = 0; i < lf_sync->rows; ++i) { |
| 505 | pthread_mutex_destroy(&lf_sync->mutex_[i]); |
| 506 | } |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 507 | aom_free(lf_sync->mutex_); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 508 | } |
| 509 | if (lf_sync->cond_ != NULL) { |
| 510 | for (i = 0; i < lf_sync->rows; ++i) { |
| 511 | pthread_cond_destroy(&lf_sync->cond_[i]); |
| 512 | } |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 513 | aom_free(lf_sync->cond_); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 514 | } |
| 515 | #endif // CONFIG_MULTITHREAD |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 516 | aom_free(lf_sync->lfdata); |
| 517 | aom_free(lf_sync->cur_sb_col); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 518 | // clear the structure as the source of this call may be a resize in which |
| 519 | // case this call will be followed by an _alloc() which may fail. |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 520 | av1_zero(*lf_sync); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 521 | } |
| 522 | } |
| 523 | |
| 524 | // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' |
| 525 | // members, so we treat it as an array, and sum over the whole length. |
Debargha Mukherjee | 5802ebe | 2016-12-21 04:17:24 -0800 | [diff] [blame] | 526 | void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, |
| 527 | FRAME_COUNTS *counts) { |
| 528 | unsigned int *const acc = (unsigned int *)acc_counts; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 529 | const unsigned int *const cnt = (unsigned int *)counts; |
| 530 | |
| 531 | const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); |
| 532 | unsigned int i; |
| 533 | |
| 534 | for (i = 0; i < n_counts; i++) acc[i] += cnt[i]; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 535 | } |