Improve pack bitstream multi-thread performance

This CL improves pack bitstream multi-thread performance by
accounting for the per-worker setup time overhead and the per-tile
job dispatch time overhead when choosing the number of workers for
the given number of tiles and available workers.
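
The worker count is chosen by maximizing, over each candidate count w
from 2 up to the number of available workers, the estimated net gain

  ((w - 1) / w) * frame_abs_sum_level
      - w * SETUP_TIME_OH_CONST
      - num_tiles * JOB_DISP_TIME_OH_CONST / w

and falling back to single-threaded packing when no candidate scores
above zero. As an illustrative example (not measured data): with
frame_abs_sum_level = 1000 and 8 tiles, 8 workers score about 834 and
are selected, whereas with frame_abs_sum_level = 20 every candidate
scores below zero and packing stays single-threaded.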

Change-Id: I98e4b0df684e40262ce87ccde24522d98080a825
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index f4c20b0..7093f3e 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -48,6 +48,8 @@
 #include "av1/encoder/tokenize.h"
 
 #define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5     // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1  // Job dispatch time overhead per tile
 
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -3918,19 +3920,36 @@
 // As per the experiments, single-thread bitstream packing is better for
 // frames with a smaller bitstream size. This behavior is due to setup time
 // overhead of multithread function would be more than that of time required
-// to pack the smaller bitstream of such frames. We set a threshold on the
-// total absolute sum of transform coeffs to detect such frames and disable
-// Multithreading.
-int enable_pack_bitstream_mt(const TileDataEnc *tile_data, int num_tiles,
-                             int num_workers) {
-  if (AOMMIN(num_workers, num_tiles) <= 1) return 0;
+// to pack the smaller bitstream of such frames. This function computes the
+// ideal number of workers based on the setup time overhead and the job
+// dispatch time overhead for the given tiles and available workers.
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+                            int avail_workers) {
+  if (AOMMIN(avail_workers, num_tiles) <= 1) return 1;
 
-  const int num_work_sqr = num_workers * num_workers;
-  const uint64_t thresh = 50;
   uint64_t frame_abs_sum_level = 0;
+
   for (int idx = 0; idx < num_tiles; idx++)
     frame_abs_sum_level += tile_data[idx].abs_sum_level;
-  return ((frame_abs_sum_level > (num_work_sqr * thresh) / (num_workers - 1)));
+
+  aom_clear_system_state();
+  int ideal_num_workers = 1;
+  const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+  float max_sum = 0.0;
+
+  for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+    const float fas_per_worker_const =
+        ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+    const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+    const float this_sum = fas_per_worker_const - setup_time_const -
+                           job_disp_time_const / num_workers;
+
+    if (this_sum > max_sum) {
+      max_sum = this_sum;
+      ideal_num_workers = num_workers;
+    }
+  }
+  return ideal_num_workers;
 }
 
 static INLINE uint32_t pack_tiles_in_tg_obus(
@@ -3942,18 +3961,17 @@
   unsigned int max_tile_size = 0;
   uint32_t obu_header_size = 0;
   uint8_t *tile_data_start = dst;
-  const int num_workers = cpi->mt_info.num_mod_workers[MOD_PACK_BS];
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
   const int num_tiles = tile_rows * tile_cols;
 
-  const int enable_mt =
-      enable_pack_bitstream_mt(cpi->tile_data, num_tiles, num_workers);
+  const int num_workers = calc_pack_bs_mt_workers(
+      cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS]);
 
-  if (enable_mt) {
+  if (num_workers > 1) {
     av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
                           fh_info, largest_tile_id, &max_tile_size,
-                          &obu_header_size, &tile_data_start);
+                          &obu_header_size, &tile_data_start, num_workers);
   } else {
     write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
                    fh_info, largest_tile_id, &max_tile_size, &obu_header_size,