Improve pack bitstream multi-thread performance

This CL improves pack bitstream multi-thread performance
by ordering tile jobs in descending order of the sum of
absolute quantized transform coefficient levels in each
tile, instead of by tile area.

Change-Id: I0772eae1d889a76e162daa6cc8b576d89453e818
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index dfbe7bf..e32cd3b 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -59,8 +59,8 @@
 } PackBSParams;
 
 typedef struct {
+  uint64_t abs_sum_level;
   uint16_t tile_idx;
-  int tile_size_mi;
 } PackBSTileOrder;
 
 // Pack bitstream data for pack bitstream multi-threading.
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index c403d70..b7a0c27 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -944,6 +944,7 @@
       TileInfo *const tile_info = &tile_data->tile_info;
       av1_tile_init(tile_info, cm, tile_row, tile_col);
       tile_data->firstpass_top_mv = kZeroMv;
+      tile_data->abs_sum_level = 0;
 
       if (pre_tok != NULL && tplist != NULL) {
         token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
@@ -1033,6 +1034,7 @@
        mi_row += cm->seq_params->mib_size) {
     av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
   }
+  this_tile->abs_sum_level = td->abs_sum_level;
 }
 
 /*!\brief Break one frame into tiles and encode the tiles
@@ -1061,6 +1063,7 @@
           &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
       cpi->td.intrabc_used = 0;
       cpi->td.deltaq_used = 0;
+      cpi->td.abs_sum_level = 0;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
       // Reset cyclic refresh counters.
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 938f930..98bcb6e 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1283,6 +1283,7 @@
   TileInfo tile_info;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   FRAME_CONTEXT *row_ctx;
+  uint64_t abs_sum_level;
   uint8_t allow_update_cdf;
   InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
   AV1EncRowMultiThreadSync row_mt_sync;
@@ -1311,6 +1312,7 @@
   PALETTE_BUFFER *palette_buffer;
   CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
+  uint64_t abs_sum_level;
   uint8_t *tmp_pred_bufs[2];
   int intrabc_used;
   int deltaq_used;
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 2582ba8..0eb1348 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -624,6 +624,7 @@
       const int coeff_ctx = coeff_contexts[pos];
       const tran_low_t v = qcoeff[pos];
       const tran_low_t level = abs(v);
+      td->abs_sum_level += level;
 
       if (allow_update_cdf) {
         if (c == eob - 1) {
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index fb428a1..3bf0960 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -480,6 +480,7 @@
 
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
+    td->abs_sum_level = 0;
 
     if (this_tile->allow_update_cdf) {
       td->mb.row_ctx = this_tile->row_ctx;
@@ -502,6 +503,7 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
+    this_tile->abs_sum_level += td->abs_sum_level;
     row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
     pthread_mutex_unlock(enc_row_mt_mutex_);
@@ -840,6 +842,7 @@
 
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
+    thread_data->td->abs_sum_level = 0;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -1810,14 +1813,14 @@
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
-// Compare and order tiles based on tile size.
+// Compare and order tiles based on absolute sum of tx coeffs.
 static int compare_tile_order(const void *a, const void *b) {
   const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
   const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
 
-  if (tile_a->tile_size_mi > tile_b->tile_size_mi)
+  if (tile_a->abs_sum_level > tile_b->abs_sum_level)
     return -1;
-  else if (tile_a->tile_size_mi == tile_b->tile_size_mi)
+  else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
     return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
   else
     return 1;
@@ -2040,8 +2043,8 @@
 
   // Populate pack bitstream tile order structure
   for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
-    pack_bs_tile_order[tile_idx].tile_size_mi =
-        pack_bs_params[tile_idx].tile_size_mi;
+    pack_bs_tile_order[tile_idx].abs_sum_level =
+        cpi->tile_data[tile_idx].abs_sum_level;
     pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
   }