rtc: Reduce top-right dependency wait time

In non-RD real-time (rt) multi-threaded encoding, the
idle time of threads is reduced by performing the
top-right dependency wait in units of mi instead of
superblocks. This change is enabled via a speed
feature for speed >= 9.

Change-Id: Ifc73b01947996d6b90a69553f239776b40bb2279
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index b2e72d2..88557b0 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -423,12 +423,16 @@
 
 static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
                                      TransformationType type) {
+  // Return early when 'type' cannot signal global motion (always the case in
+  // rt, where global mv is disabled) so that 'mbmi->bsize' is never read.
+  // This prevents a data race in multi-threaded realtime encoding, as
+  // mbmi->bsize is updated in the function direct_partition_merging().
+  if (type <= TRANSLATION) return 0;
   const PREDICTION_MODE mode = mbmi->mode;
   const BLOCK_SIZE bsize = mbmi->bsize;
   const int block_size_allowed =
       AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
-  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
-         block_size_allowed;
+  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && block_size_allowed;
 }
 
 #if CONFIG_MISMATCH_DEBUG
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index e918e15..54ac13d 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -754,33 +754,6 @@
   }
 }
 
-// Check if the cost update of symbols mode, coeff and dv are tile or off.
-static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
-    const AV1_COMP *const cpi) {
-  const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
-
-  return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
-          inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
-          cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
-}
-
-// When row-mt is enabled and cost update frequencies are set to off/tile,
-// processing of current SB can start even before processing of top-right SB
-// is finished. This function checks if it is sufficient to wait for top SB
-// to finish processing before current SB starts processing.
-static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
-  const MODE mode = cpi->oxcf.mode;
-  if (mode == GOOD) return 0;
-
-  if (mode == ALLINTRA)
-    return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
-  else if (mode == REALTIME)
-    return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
-            cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
-  else
-    return 0;
-}
-
 /*!\brief Determine whether grading content is needed based on sf and frame stat
  *
  * \ingroup partition_search
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 3a0df60..462600c 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -565,6 +565,39 @@
   assert(total_valid_refs <= max_allowed_refs);
 }
 
+// Check if the cost update of symbols mode, coeff and dv are tile or off.
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+    const AV1_COMP *const cpi) {
+  const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+
+  return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+          inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+          cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
+
+// When row-mt is enabled and cost update frequencies are set to off/tile,
+// processing of current SB can start even before processing of top-right SB
+// is finished. This function checks if it is sufficient to wait for top SB
+// to finish processing before current SB starts processing.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+  const MODE mode = cpi->oxcf.mode;
+  if (mode == GOOD) return 0;
+
+  if (mode == ALLINTRA)
+    return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+  else if (mode == REALTIME)
+    return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
+            cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+  else
+    return 0;
+}
+
+// This function checks if top right dependency wait at mi level can be enabled.
+static AOM_INLINE int enable_top_right_sync_wait_in_mis(const AV1_COMP *cpi,
+                                                        int seg_skip_active) {
+  return cpi->sf.rt_sf.top_right_sync_wait_in_mis && !seg_skip_active &&
+         delay_wait_for_top_right_sb(cpi);
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index e374589..a8c1914 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1364,11 +1364,17 @@
 #endif  // CONFIG_MULTITHREAD
   /*!
    * Buffer to store the superblock whose encoding is complete.
-   * cur_col[i] stores the number of superblocks which finished encoding in the
-   * ith superblock row.
+   * num_finished_cols[i] stores the number of superblocks which finished
+   * encoding in the ith superblock row.
    */
   int *num_finished_cols;
   /*!
+   * Buffer to store the mi position of the block whose encoding is complete.
+   * finished_block_in_mi[i] stores the mi position of the block which finished
+   * encoding in the ith superblock row.
+   */
+  int *finished_block_in_mi;
+  /*!
    * Denotes the superblock interval at which conditional signalling should
    * happen. Also denotes the minimum number of extra superblocks of the top row
    * to be complete to start encoding the current superblock. A value of 1
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 7be7768..02d638f 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -187,6 +187,9 @@
 
   CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
                   aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
+  CHECK_MEM_ERROR(
+      cm, row_mt_sync->finished_block_in_mi,
+      aom_malloc(sizeof(*row_mt_sync->finished_block_in_mi) * rows));
 
   row_mt_sync->rows = rows;
   // Set up nsync.
@@ -213,6 +216,7 @@
     }
 #endif  // CONFIG_MULTITHREAD
     aom_free(row_mt_sync->num_finished_cols);
+    aom_free(row_mt_sync->finished_block_in_mi);
 
     // clear the structure as the source of this call may be dynamic change
     // in tiles in which case this call will be followed by an _alloc()
@@ -1559,6 +1563,8 @@
       // Initialize num_finished_cols to -1 for all rows.
       memset(row_mt_sync->num_finished_cols, -1,
              sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows);
+      memset(row_mt_sync->finished_block_in_mi, -1,
+             sizeof(*row_mt_sync->finished_block_in_mi) * max_sb_rows);
       row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
       row_mt_sync->num_threads_working = 0;
       row_mt_sync->intrabc_extra_top_right_sb_delay =
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 4dd47e0..1e56e07 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -751,18 +751,9 @@
     av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
 }
 
-// For real time/allintra row-mt enabled multi-threaded encoding with cost
-// update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated
-// at superblock level. Thus, it is not required for the encoding of top-right
-// superblock be complete for updating tile ctxt. However, when encoding a block
-// whose right edge is also the superblock edge, intra and inter mode evaluation
-// (ref mv list population) require the encoding of the top-right superblock to
-// be complete. So, here, we delay the waiting of threads until the need for the
-// data from the top-right superblock region.
-static AOM_INLINE void wait_for_top_right_sb(
-    AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
-    TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
-    BLOCK_SIZE bsize, int mi_row, int mi_col) {
+static AOM_INLINE int is_top_right_block_in_sb(BLOCK_SIZE sb_size,
+                                               BLOCK_SIZE bsize, int mi_row,
+                                               int mi_col) {
   const int sb_size_in_mi = mi_size_wide[sb_size];
   const int bw_in_mi = mi_size_wide[bsize];
   const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
@@ -770,16 +761,103 @@
   const int top_right_block_in_sb =
       (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
 
-  // Don't wait if the block is the not the top-right block in the superblock.
-  if (!top_right_block_in_sb) return;
+  return top_right_block_in_sb;
+}
 
-  // Wait for the top-right superblock to finish encoding.
+// For real time/allintra row-mt enabled multi-threaded encoding with cost
+// update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated
+// at superblock level. Thus, it is not required for the encoding of top-right
+// superblock be complete for updating tile ctxt. However, when encoding a block
+// whose right edge is also the superblock edge, intra and inter mode evaluation
+// (ref mv list population) require the encoding of the top-right region to
+// be complete. So, here, we delay the waiting of threads until the need for the
+// data from the top-right superblock region.
+static AOM_INLINE void wait_for_top_right(AV1_COMP *cpi,
+                                          AV1EncRowMultiThreadSync *row_mt_sync,
+                                          TileInfo *tile_info,
+                                          BLOCK_SIZE sb_size, BLOCK_SIZE bsize,
+                                          int mi_row, int mi_col,
+                                          int seg_skip_active) {
+  // Don't wait if the block is not the top-right block in the superblock.
+  if (!is_top_right_block_in_sb(sb_size, bsize, mi_row, mi_col)) return;
+
+  AV1EncRowMultiThreadInfo *enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int sb_mi_size_log2 = mi_size_wide_log2[sb_size];
   const int sb_row_in_tile =
       (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
-  const int sb_col_in_tile =
-      (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
 
-  enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+  // In case of non-rd RT with row-mt enabled, encoding of SB can start after
+  // encoding of bottom left block in above right superblock is complete. This
+  // is because only DC, H and V intra modes are enabled via the speed feature
+  // intra_y_mode_bsize_mask_nrd (above right region not required) and reference
+  // mv list population requires only the above right block info.
+  if (enable_top_right_sync_wait_in_mis(cpi, seg_skip_active)) {
+    const int *intra_y_mode_bsize_mask_nrd =
+        cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd;
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      assert(intra_y_mode_bsize_mask_nrd[i] == INTRA_DC ||
+             intra_y_mode_bsize_mask_nrd[i] == INTRA_DC_H_V);
+    (void)intra_y_mode_bsize_mask_nrd;
+#if CONFIG_MULTITHREAD
+    const int mi_col_in_tile = mi_col - tile_info->mi_col_start;
+    const int mi_cols_in_tile = tile_info->mi_col_end - tile_info->mi_col_start;
+    const int bw_in_mi = mi_size_wide[bsize];
+    if (sb_row_in_tile) {
+      pthread_mutex_t *const mutex = &row_mt_sync->mutex_[sb_row_in_tile - 1];
+      pthread_mutex_lock(mutex);
+
+      while (AOMMIN(mi_col_in_tile + bw_in_mi, mi_cols_in_tile) >=
+             row_mt_sync->finished_block_in_mi[sb_row_in_tile - 1]) {
+        pthread_cond_wait(&row_mt_sync->cond_[sb_row_in_tile - 1], mutex);
+      }
+      pthread_mutex_unlock(mutex);
+    }
+#endif
+  } else {
+    const int sb_col_in_tile =
+        (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+    enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+  }
+}
+
+static AOM_INLINE void write_completed_mi_pos(
+    AV1EncRowMultiThreadSync *row_mt_sync, TileInfo *tile_info,
+    BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int sb_size_in_mi = mi_size_high[sb_size];
+  const int bh_in_mi = mi_size_high[bsize];
+  const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
+  const int bottom_block_in_sb = blk_row_in_sb + bh_in_mi >= sb_size_in_mi;
+
+  // Don't write if the block is not the bottom block in the
+  // superblock.
+  if (!bottom_block_in_sb) return;
+
+#if CONFIG_MULTITHREAD
+  const int sb_mi_size_log2 = mi_size_wide_log2[sb_size];
+  const int sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+  const int bw_in_mi = mi_size_wide[bsize];
+  const int mi_col_in_tile = mi_col + bw_in_mi - tile_info->mi_col_start;
+  const int mi_cols_in_tile = tile_info->mi_col_end - tile_info->mi_col_start;
+
+  const int finished_mi_col = mi_col_in_tile < mi_cols_in_tile - 1
+                                  ? mi_col_in_tile
+                                  : mi_cols_in_tile + 1;
+
+  pthread_mutex_lock(&row_mt_sync->mutex_[sb_row_in_tile]);
+
+  row_mt_sync->finished_block_in_mi[sb_row_in_tile] = finished_mi_col;
+
+  pthread_cond_signal(&row_mt_sync->cond_[sb_row_in_tile]);
+  pthread_mutex_unlock(&row_mt_sync->mutex_[sb_row_in_tile]);
+#else
+  (void)row_mt_sync;
+  (void)tile_info;
+  (void)sb_size;
+  (void)bsize;
+  (void)mi_row;
+  (void)mi_col;
+#endif  // CONFIG_MULTITHREAD
 }
 
 /*!\brief Interface for AV1 mode search for an individual coding block
@@ -852,9 +930,8 @@
 
   // This is only needed for real time/allintra row-mt enabled multi-threaded
   // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
-  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
-                        &tile_data->tile_info, cm->seq_params->sb_size,
-                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+  wait_for_top_right(cpi, &tile_data->row_mt_sync, &tile_data->tile_info,
+                     cm->seq_params->sb_size, bsize, mi_row, mi_col, 0);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, rd_pick_sb_modes_time);
@@ -2139,6 +2216,11 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
 #endif
+  const int seg_skip_active =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  if (!dry_run && enable_top_right_sync_wait_in_mis(cpi, seg_skip_active))
+    write_completed_mi_pos(&tile_data->row_mt_sync, &tile_data->tile_info,
+                           cm->seq_params->sb_size, bsize, mi_row, mi_col);
 }
 
 /*!\brief Top level function to pick block mode for non-RD optimized case
@@ -2195,12 +2277,6 @@
   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int i;
 
-  // This is only needed for real time/allintra row-mt enabled multi-threaded
-  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
-  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
-                        &tile_data->tile_info, cm->seq_params->sb_size,
-                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
-
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, pick_sb_modes_nonrd_time);
 #endif
@@ -2224,6 +2300,15 @@
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
   // Set error per bit for current rdmult
   av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+
+  const int seg_skip_active =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  // This is only needed for real time/allintra row-mt enabled multi-threaded
+  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right(cpi, &tile_data->row_mt_sync, &tile_data->tile_info,
+                     cm->seq_params->sb_size, bsize, mi_row, mi_col,
+                     seg_skip_active);
+
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
@@ -2238,7 +2323,7 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, nonrd_pick_inter_mode_sb_time);
 #endif
-    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    if (seg_skip_active) {
       RD_STATS invalid_rd;
       av1_invalid_rd_stats(&invalid_rd);
       // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
@@ -2463,7 +2548,15 @@
     // Update mi for this partition block.
     for (int y = 0; y < bs; y++) {
       for (int x_idx = 0; x_idx < bs; x_idx++) {
-        this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0];
+        this_mi[x_idx + y * mi_params->mi_stride]->bsize = this_mi[0]->bsize;
+        this_mi[x_idx + y * mi_params->mi_stride]->partition =
+            this_mi[0]->partition;
+        this_mi[x_idx + y * mi_params->mi_stride]->skip_txfm =
+            this_mi[0]->skip_txfm;
+        this_mi[x_idx + y * mi_params->mi_stride]->tx_size =
+            this_mi[0]->tx_size;
+        memcpy(this_mi[x_idx + y * mi_params->mi_stride]->inter_tx_size,
+               this_mi[0]->inter_tx_size, sizeof(this_mi[0]->inter_tx_size));
       }
     }
   }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 0dd3d97..5f3a949 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1702,6 +1702,14 @@
     sf->rt_sf.frame_level_mode_cost_update = true;
     sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
     sf->rt_sf.reduce_mv_pel_precision = 0;
+    // For multi-thread use case with row_mt enabled, enable top right
+    // dependency wait of threads at mi level.
+    if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) {
+      sf->rt_sf.top_right_sync_wait_in_mis =
+          frame_is_intra_only(cm) ? 0
+                                  : (!cpi->oxcf.tool_cfg.enable_global_motion &&
+                                     cpi->sf.rt_sf.use_nonrd_pick_mode);
+    }
   }
   if (speed >= 10) {
     sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
@@ -2045,6 +2053,7 @@
   rt_sf->check_only_zero_zeromv_on_large_blocks = false;
   rt_sf->disable_cdf_update_non_reference_frame = false;
   rt_sf->prune_compoundmode_with_singlemode_var = false;
+  rt_sf->top_right_sync_wait_in_mis = false;
 }
 
 // Populate appropriate sub-pel search method based on speed feature and user
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 3c87384..3d50d1c 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1601,6 +1601,10 @@
 
   // Prune compound modes if the single modes variances do not perform well.
   bool prune_compoundmode_with_singlemode_var;
+
+  // In multi-threaded encoding, enable top right dependency wait of threads at
+  // mi level.
+  bool top_right_sync_wait_in_mis;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */