Multi-thread support for bitmask

Enable multi-threading when the loop filter bitmask is used.

Similar to the existing multi-threading without bitmask, each row of
64x64 blocks (instead of the previous 128x128 blocks) uses a thread.

Building the bitmask for the whole frame currently uses only one thread.


Change-Id: Ibd753b677ecf2d04ac0dcccf6a06b5fcd97a7c23
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 4ea4e16..3129ce5 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -175,6 +175,22 @@
                                      int mi_row, int mi_col);
 int get_index_shift(int mi_col, int mi_row, int *index);
 
+void av1_build_bitmask_vert_info(
+    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane);
+
+void av1_build_bitmask_horz_info(
+    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane);
+
+void av1_filter_block_plane_bitmask_vert(
+    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
+    int pl, int mi_row, int mi_col);
+
+void av1_filter_block_plane_bitmask_horz(
+    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
+    int pl, int mi_row, int mi_col);
+
 extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
 
 extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 8df4c9a..064cd67 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -205,7 +205,11 @@
 }
 
 static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
-                            int stop, int plane_start, int plane_end) {
+                            int stop,
+#if LOOP_FILTER_BITMASK
+                            int is_decoding,
+#endif
+                            int plane_start, int plane_end) {
   int mi_row, plane, dir;
   AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
   lf_sync->jobs_enqueued = 0;
@@ -219,7 +223,16 @@
         continue;
       else if (plane == 2 && !(cm->lf.filter_level_v))
         continue;
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+#if LOOP_FILTER_BITMASK
+      int step = MAX_MIB_SIZE;
+      if (is_decoding) {
+        step = MI_SIZE_64X64;
+      }
+      for (mi_row = start; mi_row < stop; mi_row += step)
+#else
+      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE)
+#endif
+      {
         lf_job_queue->mi_row = mi_row;
         lf_job_queue->plane = plane;
         lf_job_queue->dir = dir;
@@ -312,15 +325,94 @@
   return 1;
 }
 
+#if LOOP_FILTER_BITMASK
+static INLINE void thread_loop_filter_bitmask_rows(
+    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+    struct macroblockd_plane *planes, MACROBLOCKD *xd,
+    AV1LfSync *const lf_sync) {
+  const int sb_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_cols, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2;
+  int mi_row, mi_col, plane, dir;
+  int r, c;
+  (void)xd;
+
+  while (1) {
+    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+
+    if (cur_job_info != NULL) {
+      mi_row = cur_job_info->mi_row;
+      plane = cur_job_info->plane;
+      dir = cur_job_info->dir;
+      r = mi_row >> MIN_MIB_SIZE_LOG2;
+
+      if (dir == 0) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) {
+          c = mi_col >> MIN_MIB_SIZE_LOG2;
+
+          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
+                               mi_col, plane, plane + 1);
+
+          av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row,
+                                              mi_col);
+          sync_write(lf_sync, r, c, sb_cols, plane);
+        }
+      } else if (dir == 1) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) {
+          c = mi_col >> MIN_MIB_SIZE_LOG2;
+
+          // Wait for vertical edge filtering of the top-right block to be
+          // completed
+          sync_read(lf_sync, r, c, plane);
+
+          // Wait for vertical edge filtering of the right block to be
+          // completed
+          sync_read(lf_sync, r + 1, c, plane);
+
+          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
+                               mi_col, plane, plane + 1);
+          av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
+                                              mi_col);
+        }
+      }
+    } else {
+      break;
+    }
+  }
+}
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) {
+  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
+  thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm,
+                                  lf_data->planes, lf_data->xd, lf_sync);
+  return 1;
+}
+#endif  // LOOP_FILTER_BITMASK
+
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 MACROBLOCKD *xd, int start, int stop,
                                 int plane_start, int plane_end,
+#if LOOP_FILTER_BITMASK
+                                int is_decoding,
+#endif
                                 AVxWorker *workers, int nworkers,
                                 AV1LfSync *lf_sync) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+#if LOOP_FILTER_BITMASK
+  int sb_rows;
+  if (is_decoding) {
+    sb_rows =
+        ALIGN_POWER_OF_TWO(cm->mi_rows, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2;
+  } else {
+    sb_rows =
+        ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+  }
+#else
   // Number of superblock rows and cols
   const int sb_rows =
       ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+#endif
   const int num_workers = nworkers;
   int i;
 
@@ -336,14 +428,26 @@
            sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
   }
 
-  enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end);
+  enqueue_lf_jobs(lf_sync, cm, start, stop,
+#if LOOP_FILTER_BITMASK
+                  is_decoding,
+#endif
+                  plane_start, plane_end);
 
   // Set up loopfilter thread data.
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
+#if LOOP_FILTER_BITMASK
+    if (is_decoding) {
+      worker->hook = loop_filter_bitmask_row_worker;
+    } else {
+      worker->hook = loop_filter_row_worker;
+    }
+#else
     worker->hook = loop_filter_row_worker;
+#endif
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -366,8 +470,12 @@
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                               MACROBLOCKD *xd, int plane_start, int plane_end,
-                              int partial_frame, AVxWorker *workers,
-                              int num_workers, AV1LfSync *lf_sync) {
+                              int partial_frame,
+#if LOOP_FILTER_BITMASK
+                              int is_decoding,
+#endif
+                              AVxWorker *workers, int num_workers,
+                              AV1LfSync *lf_sync) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
@@ -380,8 +488,37 @@
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
 
+#if LOOP_FILTER_BITMASK
+  if (is_decoding) {
+    cm->is_decoding = is_decoding;
+    // TODO(chengchen): currently use one thread to build bitmasks for the
+    // frame. Make it support multi-thread later.
+    for (int plane = plane_start; plane < plane_end; plane++) {
+      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+        break;
+      else if (plane == 1 && !(cm->lf.filter_level_u))
+        continue;
+      else if (plane == 2 && !(cm->lf.filter_level_v))
+        continue;
+
+      // TODO(chengchen): can we remove this?
+      struct macroblockd_plane *pd = xd->plane;
+      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane,
+                           plane + 1);
+
+      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+    }
+    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+                        plane_end, 1, workers, num_workers, lf_sync);
+  } else {
+    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+                        plane_end, 0, workers, num_workers, lf_sync);
+  }
+#else
   loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
                       plane_end, workers, num_workers, lf_sync);
+#endif
 }
 
 static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 23d61d7..e7dbb8b 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -103,6 +103,9 @@
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                               struct macroblockd *mbd, int plane_start,
                               int plane_end, int partial_frame,
+#if LOOP_FILTER_BITMASK
+                              int is_decoding,
+#endif
                               AVxWorker *workers, int num_workers,
                               AV1LfSync *lf_sync);
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index cdda29c..1950d36 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5549,9 +5549,12 @@
   if (!cm->allow_intrabc && !cm->single_tile_decoding) {
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
       if (pbi->num_workers > 1) {
-        av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
-                                 num_planes, 0, pbi->tile_workers,
-                                 pbi->num_workers, &pbi->lf_row_sync);
+        av1_loop_filter_frame_mt(
+            get_frame_new_buffer(cm), cm, &pbi->mb, 0, num_planes, 0,
+#if LOOP_FILTER_BITMASK
+            1,
+#endif
+            pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
       } else {
         av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
 #if LOOP_FILTER_BITMASK
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 487efd2..ab9d0bc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4229,6 +4229,9 @@
   if (lf->filter_level[0] || lf->filter_level[1]) {
     if (cpi->num_workers > 1)
       av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+#if LOOP_FILTER_BITMASK
+                               0,
+#endif
                                cpi->workers, cpi->num_workers,
                                &cpi->lf_row_sync);
     else
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index c99d0a2..5bd0e60 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -71,8 +71,11 @@
   // filter mask is compatible with multi-thread.
   if (cpi->num_workers > 1)
     av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
-                             plane + 1, partial_frame, cpi->workers,
-                             cpi->num_workers, &cpi->lf_row_sync);
+                             plane + 1, partial_frame,
+#if LOOP_FILTER_BITMASK
+                             0,
+#endif
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
   else
     av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd,
 #if LOOP_FILTER_BITMASK