Use highest possible number of threads for temporal filtering

During multi-threaded encoding, the maximum number of workers permitted
by the module's processing constraints is allocated for temporal
filtering. Speed gains of up to 2.8% were seen for 720p single-tile
encodes using 10 threads.

Change-Id: Ic7b627dfb62c14f1c9611b1eff50771e705b9c58
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index b71a40a..bb0cd66 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -1813,7 +1813,19 @@
 
 // Computes num_workers for temporal filter multi-threading.
 static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
-  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  // For single-pass encode, using no. of workers as per tf block size was not
+  // found to improve speed. Hence the thread assignment for single-pass encode
+  // is kept based on compute_num_enc_workers().
+  if (cpi->oxcf.pass != 2)
+    return (compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
+
+  const int frame_height = cpi->common.height;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int mb_height = block_size_high[block_size];
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  return AOMMIN(cpi->oxcf.max_threads, mb_rows);
 }
 
 // Computes num_workers for tpl multi-threading.