Improve pack bitstream multi-thread performance

This CL improves pack bitstream multi-thread performance by
accounting for the per-worker setup time overhead and the per-tile
job dispatch time overhead when choosing the number of workers for
the given number of tiles and available workers.
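
The worker count is chosen by maximizing, over each candidate count w
from 2 up to the number of available workers, the estimated net gain

  ((w - 1) / w) * frame_abs_sum_level
      - w * SETUP_TIME_OH_CONST
      - num_tiles * JOB_DISP_TIME_OH_CONST / w

and falling back to single-threaded packing when no candidate scores
above zero. As an illustrative example (not measured data): with
frame_abs_sum_level = 1000 and 8 tiles, 8 workers score about 834 and
are selected, whereas with frame_abs_sum_level = 20 every candidate
scores below zero and packing stays single-threaded.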

Change-Id: I98e4b0df684e40262ce87ccde24522d98080a825
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index f4c20b0..7093f3e 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -48,6 +48,8 @@
 #include "av1/encoder/tokenize.h"
 
 #define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5     // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1  // Job dispatch time overhead per tile
 
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -3918,19 +3920,36 @@
 // As per the experiments, single-thread bitstream packing is better for
 // frames with a smaller bitstream size. This behavior is due to setup time
 // overhead of multithread function would be more than that of time required
-// to pack the smaller bitstream of such frames. We set a threshold on the
-// total absolute sum of transform coeffs to detect such frames and disable
-// Multithreading.
-int enable_pack_bitstream_mt(const TileDataEnc *tile_data, int num_tiles,
-                             int num_workers) {
-  if (AOMMIN(num_workers, num_tiles) <= 1) return 0;
+// to pack the smaller bitstream of such frames. This function computes the
+// ideal number of workers based on the setup time overhead and the job
+// dispatch time overhead for the given tiles and available workers.
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+                            int avail_workers) {
+  if (AOMMIN(avail_workers, num_tiles) <= 1) return 1;
 
-  const int num_work_sqr = num_workers * num_workers;
-  const uint64_t thresh = 50;
   uint64_t frame_abs_sum_level = 0;
+
   for (int idx = 0; idx < num_tiles; idx++)
     frame_abs_sum_level += tile_data[idx].abs_sum_level;
-  return ((frame_abs_sum_level > (num_work_sqr * thresh) / (num_workers - 1)));
+
+  aom_clear_system_state();
+  int ideal_num_workers = 1;
+  const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+  float max_sum = 0.0;
+
+  for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+    const float fas_per_worker_const =
+        ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+    const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+    const float this_sum = fas_per_worker_const - setup_time_const -
+                           job_disp_time_const / num_workers;
+
+    if (this_sum > max_sum) {
+      max_sum = this_sum;
+      ideal_num_workers = num_workers;
+    }
+  }
+  return ideal_num_workers;
 }
 
 static INLINE uint32_t pack_tiles_in_tg_obus(
@@ -3942,18 +3961,17 @@
   unsigned int max_tile_size = 0;
   uint32_t obu_header_size = 0;
   uint8_t *tile_data_start = dst;
-  const int num_workers = cpi->mt_info.num_mod_workers[MOD_PACK_BS];
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
   const int num_tiles = tile_rows * tile_cols;
 
-  const int enable_mt =
-      enable_pack_bitstream_mt(cpi->tile_data, num_tiles, num_workers);
+  const int num_workers = calc_pack_bs_mt_workers(
+      cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS]);
 
-  if (enable_mt) {
+  if (num_workers > 1) {
     av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
                           fh_info, largest_tile_id, &max_tile_size,
-                          &obu_header_size, &tile_data_start);
+                          &obu_header_size, &tile_data_start, num_workers);
   } else {
     write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
                    fh_info, largest_tile_id, &max_tile_size, &obu_header_size,